I am working on a task where I have implemented the K-prototypes clustering method in R. I have mixed-type data comprising numerical and categorical features. From the K-prototypes documentation, I know that we have to convert the character columns to factors before running the clustering algorithm which I did. The kproto
function is working fine with my data but when I tried to plot this it gave this error: Error in colMeans(x, na.rm = TRUE) : 'x' must be numeric
. I know there are some related queries on Stack Overflow but I want to try something like mine.
Data Sample:
structure(list(mask = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1
), label = "mask", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double")),
year = structure(c(54, 28, 57, 37, 44, 32, 33, 58, 34, 41,
47, 40, 33, 41, 31, 51, 39, 49, 54, 34, 43, 39, 33, 18, 53,
31, 43, 66, 21, 18), label = "year", format.stata = "%10.0g"),
gender = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0), label = "gender", format.stata = "%10.0g", labels = c(Femme = 0,
Homme = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), marital_status = structure(c(1, 1, 2, 1, 2, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 0, 2,
1, 0, 0), label = "marital_status", format.stata = "%15.0g", labels = c(Célibataire = 0,
`Marié monogame` = 1, `Marié polygame` = 2, `Veuf(ve)` = 3
), class = c("haven_labelled", "vctrs_vctr", "double")),
educ = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "educ", format.stata = "%27.0g", labels = c(`Non scolarisé` = 0,
`Scolarisé(niveau primaire)` = 1), class = c("haven_labelled",
"vctrs_vctr", "double")), own = structure(c(2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
2, 2, 2, 2, 2, 2), label = "own", format.stata = "%18.0g", labels = c(`Logé gratuitement` = 0,
Propriétaire = 1, Locataire = 2), class = c("haven_labelled",
"vctrs_vctr", "double")), inc_per_month = structure(c("[50005-75000[",
"[50005-75000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[",
"[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[",
"[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[",
"[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[",
"[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[",
"[0-50000[", "[0-50000[", "[0-50000[", "[0-50000["), label = "inc_per_month", format.stata = "%13s"),
evo_income = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "evo_income", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), workday = structure(c(6, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7,
7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 7, 6), label = "workday", format.stata = "%10.0g", labels = c(`5 jours` = 5,
`6 jours` = 6, `7 jours` = 7), class = c("haven_labelled",
"vctrs_vctr", "double")), daily_hour = structure(c(6, 8,
8, 6, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 7, 10, 8, 8, 10, 10,
10, 8, 10, 6, 10, 7, 10, 8, 6), label = "daily_hour", format.stata = "%10.0g"),
work_exp = structure(c(15, 5, 20, 10, 24, 8, 6, 24, 6, 10,
18, 6, 5, 8, 5, 20, 8, 6, 28, 7, 10, 8, 6, 5, 24, 5, 20,
12, 5, 2), label = "work_exp", format.stata = "%10.0g"),
env_awareness = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1), label = "env_awareness", format.stata = "%10.0g", labels = c(Oui = 1,
Non = 2), class = c("haven_labelled", "vctrs_vctr", "double"
)), injury = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "injury", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), contam1 = structure(c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "contam1", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), contam2 = structure(c(1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "contam2", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), fire = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "fire", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), other_risk = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), label = "other_risk", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), plast = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "plast", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), paper_card = structure(c(0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0), label = "paper_card", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), metal = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "metal", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), glass = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "glass", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), wood = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), label = "wood", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), textile = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "textile", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), org = structure(c(1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1), label = "org", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), kg_per_day = structure(c(5, 4, 4, 3, 5, 4, 5, 5, 4, 5,
5, 4, 5, 5, 5, 5, 4, 5, 5, 7, 5, 5, 5, 4, 4, 4, 4, 4, 4,
4), label = "kg_per_day", format.stata = "%10.0g"), stigma = structure(c(4,
4, 2, 4, 3, 3, 2, 2, 3, 3, 4, 2, 2, 2, 2, 3, 2, 2, 3, 2,
3, 3, 1, 3, 2, 2, 1, 4, 3, 4), label = "stigma", format.stata = "%23.0g", labels = c(`Pas du tout stigmatisé` = 1,
`Peu stigmatisé` = 2, Stigmatisé = 3, `Très stigmatisé` = 4
), class = c("haven_labelled", "vctrs_vctr", "double")),
prog = structure(c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "prog", format.stata = "%10.0g", labels = c(Non = 0,
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
)), scenario = structure(c(".", "2", "2", "2", "2", "2",
"4", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", ".",
"2", ".", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"
), label = "scenario", format.stata = "%9s")), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Code:
# Identify character variables and converting them into Factors
data_fac <- data_new
char_cols <- sapply(data_new, is.character)
for (i in 1:length(char_cols)) {
if (char_cols[i]) {
data_fac[[i]] <- as.factor(data[[i]])
}
}
# Convert categorical variables to factors
data_fac$mask <- as.factor(data_fac$mask)
data_fac$gender <- as.factor(data_fac$gender)
data_fac$marital_status <- as.factor(data_fac$marital_status)
data_fac$educ <- as.factor(data_fac$educ)
data_fac$own <- as.factor(data_fac$own)
data_fac$inc_per_month <- as.factor(data_fac$inc_per_month)
data_fac$evo_income <- as.factor(data_fac$evo_income)
# K-Prototypes with Gower distance and 6 clusters
kp <- kproto(data_fac, k = 6, diss = "gower")
# extract cluster assignments
cluster_assignments <- kp$cluster
fviz_cluster(kp,
palette = viridis(10),
ellipse.type = "t",
ellipse.level = 0.95,
repel = TRUE,
ggtheme = theme_gray(base_size = 12, base_family = "sans"),
main = "Cluster Plot",
xlab = "PC1",
ylab = "PC2",
addEllipses = TRUE,
show.clust.cent = TRUE,
pointsize = 2,
labelsize = 3.5,
title.size = 0.8,
legend.title = "Cluster",
legend.position = "right",
legend.labsize = 8,
legend.text.color = "black")