K-prototype Clustering in R

I am working on a task where I have implemented the K-prototypes clustering method in R. I have mixed-type data comprising numerical and categorical features. From the K-prototypes documentation, I know that we have to convert the character columns to factors before running the clustering algorithm which I did. The kproto function is working fine with my data but when I tried to plot this it gave this error: Error in colMeans(x, na.rm = TRUE) : 'x' must be numeric. I know there are some related queries on Stack Overflow but I want to try something like mine.

Data Sample:

structure(list(mask = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1
), label = "mask", format.stata = "%10.0g", labels = c(Non = 0, 
Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double")), 
    year = structure(c(54, 28, 57, 37, 44, 32, 33, 58, 34, 41, 
    47, 40, 33, 41, 31, 51, 39, 49, 54, 34, 43, 39, 33, 18, 53, 
    31, 43, 66, 21, 18), label = "year", format.stata = "%10.0g"), 
    gender = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0), label = "gender", format.stata = "%10.0g", labels = c(Femme = 0, 
    Homme = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), marital_status = structure(c(1, 1, 2, 1, 2, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 0, 2, 
    1, 0, 0), label = "marital_status", format.stata = "%15.0g", labels = c(Célibataire = 0, 
    `Marié monogame` = 1, `Marié polygame` = 2, `Veuf(ve)` = 3
    ), class = c("haven_labelled", "vctrs_vctr", "double")), 
    educ = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "educ", format.stata = "%27.0g", labels = c(`Non scolarisé` = 0, 
    `Scolarisé(niveau primaire)` = 1), class = c("haven_labelled", 
    "vctrs_vctr", "double")), own = structure(c(2, 2, 2, 2, 2, 
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 
    2, 2, 2, 2, 2, 2), label = "own", format.stata = "%18.0g", labels = c(`Logé gratuitement` = 0, 
    Propriétaire = 1, Locataire = 2), class = c("haven_labelled", 
    "vctrs_vctr", "double")), inc_per_month = structure(c("[50005-75000[", 
    "[50005-75000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", 
    "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", 
    "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", 
    "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", 
    "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000[", 
    "[0-50000[", "[0-50000[", "[0-50000[", "[0-50000["), label = "inc_per_month", format.stata = "%13s"), 
    evo_income = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "evo_income", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), workday = structure(c(6, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 
    7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 7, 6), label = "workday", format.stata = "%10.0g", labels = c(`5 jours` = 5, 
    `6 jours` = 6, `7 jours` = 7), class = c("haven_labelled", 
    "vctrs_vctr", "double")), daily_hour = structure(c(6, 8, 
    8, 6, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 7, 10, 8, 8, 10, 10, 
    10, 8, 10, 6, 10, 7, 10, 8, 6), label = "daily_hour", format.stata = "%10.0g"), 
    work_exp = structure(c(15, 5, 20, 10, 24, 8, 6, 24, 6, 10, 
    18, 6, 5, 8, 5, 20, 8, 6, 28, 7, 10, 8, 6, 5, 24, 5, 20, 
    12, 5, 2), label = "work_exp", format.stata = "%10.0g"), 
    env_awareness = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1), label = "env_awareness", format.stata = "%10.0g", labels = c(Oui = 1, 
    Non = 2), class = c("haven_labelled", "vctrs_vctr", "double"
    )), injury = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "injury", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), contam1 = structure(c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "contam1", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), contam2 = structure(c(1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "contam2", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), fire = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 
    0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "fire", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), other_risk = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0), label = "other_risk", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), plast = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "plast", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), paper_card = structure(c(0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
    0), label = "paper_card", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), metal = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "metal", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), glass = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "glass", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), wood = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), label = "wood", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), textile = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), label = "textile", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), org = structure(c(1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1), label = "org", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), kg_per_day = structure(c(5, 4, 4, 3, 5, 4, 5, 5, 4, 5, 
    5, 4, 5, 5, 5, 5, 4, 5, 5, 7, 5, 5, 5, 4, 4, 4, 4, 4, 4, 
    4), label = "kg_per_day", format.stata = "%10.0g"), stigma = structure(c(4, 
    4, 2, 4, 3, 3, 2, 2, 3, 3, 4, 2, 2, 2, 2, 3, 2, 2, 3, 2, 
    3, 3, 1, 3, 2, 2, 1, 4, 3, 4), label = "stigma", format.stata = "%23.0g", labels = c(`Pas du tout stigmatisé` = 1, 
    `Peu stigmatisé` = 2, Stigmatisé = 3, `Très stigmatisé` = 4
    ), class = c("haven_labelled", "vctrs_vctr", "double")), 
    prog = structure(c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "prog", format.stata = "%10.0g", labels = c(Non = 0, 
    Oui = 1), class = c("haven_labelled", "vctrs_vctr", "double"
    )), scenario = structure(c(".", "2", "2", "2", "2", "2", 
    "4", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", ".", 
    "2", ".", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"
    ), label = "scenario", format.stata = "%9s")), row.names = c(NA, 
-30L), class = c("tbl_df", "tbl", "data.frame"))

Code:

# Identify character variables and converting them into Factors

data_fac <- data_new
char_cols <- sapply(data_new, is.character)

for (i in 1:length(char_cols)) {
  if (char_cols[i]) {
    data_fac[[i]] <- as.factor(data[[i]])
  }
}
# Convert categorical variables to factors

data_fac$mask <- as.factor(data_fac$mask)
data_fac$gender <- as.factor(data_fac$gender)
data_fac$marital_status <- as.factor(data_fac$marital_status)
data_fac$educ <- as.factor(data_fac$educ)
data_fac$own <- as.factor(data_fac$own)
data_fac$inc_per_month <- as.factor(data_fac$inc_per_month)
data_fac$evo_income <- as.factor(data_fac$evo_income)

# K-Prototypes with Gower distance and 6 clusters
kp <- kproto(data_fac, k = 6, diss = "gower")

# extract cluster assignments
cluster_assignments <- kp$cluster

fviz_cluster(kp, 
             palette = viridis(10),
             ellipse.type = "t",
             ellipse.level = 0.95,
             repel = TRUE,
             ggtheme = theme_gray(base_size = 12, base_family = "sans"),
             main = "Cluster Plot",
             xlab = "PC1",
             ylab = "PC2",
             addEllipses = TRUE,
             show.clust.cent = TRUE,
             pointsize = 2,
             labelsize = 3.5,
             title.size = 0.8,
             legend.title = "Cluster",
             legend.position = "right",
             legend.labsize = 8,
             legend.text.color = "black")

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.