Classification Decision Tree Analysis: Balanced Tree Issue

rstudio

#1

Hello,

My goal when building this decision tree analysis is that I want to see with all the explanatory variables and one response variable ("yes" or "no"), what is out put for such a decision tree?

Sample data set:

Summary
structure(list(MAJOR_CATEGORY_DESCR = structure(c(4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 21L, 12L, 
12L, 12L, 9L), .Label = c("AIR ABRASION", "AMALGAM", "ANESTHETIC", 
"ASEPSIS", "ASPIRATING", "Bleaching", "BURS & DIAMONDS", "CAD/CAM BLOCKS", 
"CEMENT", "COMPOSITE", "COTTON & PAPER", "CROWNS", "ENDO", "FINISH/POLISH", 
"GYPSUM", "HANDPIECES", "IMPRESSION", "INSTRUMENTS", "OTHER CATEGORIES", 
"PINS & POSTS", "PREVENTIVE", "Retraction", "SMALL EQUIP", "ULTRA CLEANER", 
"X-RAY"), class = "factor"), PATT_BRAND_CATEGORY = c(0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), CUST_BRANCH_DESCR = structure(c(20L, 5L, 5L, 5L, 
5L, 33L, 50L, 15L, 43L, 11L, 17L, 6L, 34L, 7L, 22L, 16L, 5L, 
18L, 43L, 43L, 52L, 17L, 6L, 46L, 37L, 54L, 56L, 11L, 58L, 35L, 
24L, 55L, 8L, 22L, 22L, 46L, 46L, 35L, 5L, 24L, 54L, 41L, 55L, 
2L, 46L, 58L, 27L, 35L, 40L, 5L, 24L, 33L, 51L, 41L, 41L, 19L, 
45L, 30L, 30L, 2L, 22L, 46L, 46L, 46L, 46L, 46L, 58L, 27L, 35L, 
35L, 36L, 18L, 24L, 24L, 3L, 50L, 15L, 45L, 49L, 53L, 53L, 53L, 
53L, 57L, 21L, 30L, 30L, 6L, 22L, 42L, 42L, 46L, 57L, 35L, 45L, 
42L, 3L, 24L, 41L, 15L, 15L), .Label = c("ALBUQUERQUE              ", 
"ATLANTA                  ", "AUSTIN                   ", "BALTIMORE", 
"BIRMINGHAM               ", "BOSTON", "CHARLESTON               ", 
"CHARLOTTE", "CHICAGO                  ", "CINCINNATI", "CLEVELAND", 
"COLUMBIA", "COLUMBUS", "CONNECTICUT", "DENVER                   ", 
"DES MOINES               ", "DETROIT", "DFW                      ", 
"EAST BAY                 ", "FARGO                    ", "GRAND RAPIDS", 
"GREENVILLE               ", "HAWAII", "HOUSTON                  ", 
"INDIANAPOLIS", "JACKSON                  ", "KANSAS CITY              ", 
"LITTLE ROCK              ", "LOS ANGELES              ", "LOUISVILLE", 
"MAINE", "MEDFORD                  ", "MEMPHIS                  ", 
"METRO NY/NJ", "MILWAUKEE                ", "MINNESOTA                ", 
"NASHVILLE                ", "NEW ORLEANS              ", "OKLAHOMA", 
"OMAHA", "ORLANDO                  ", "PHILADELPHIA", "PHOENIX                  ", 
"PITTSBURGH               ", "PORTLAND                 ", "RALEIGH", 
"RICHMOND", "ROCHESTER", "SACRAMENTO               ", "SALT LAKE CITY           ", 
"SAN ANTONIO              ", "SAN DIEGO                ", "SEATTLE                  ", 
"SHREVEPORT               ", "SOUTH BEND", "SOUTH FLORIDA            ", 
"SPOKANE                  ", "ST LOUIS                 ", "TAMPA                    ", 
"WICHITA                  "), class = "factor"), PROGRAM_LEVEL_DESCR = structure(c(10L, 
9L, 10L, 12L, 7L, 7L, 10L, 9L, 9L, 5L, 7L, 10L, 10L, 10L, 2L, 
10L, 9L, 2L, 9L, 9L, 7L, 7L, 10L, 2L, 9L, 7L, 5L, 12L, 5L, 7L, 
9L, 7L, 9L, 9L, 12L, 9L, 9L, 7L, 5L, 9L, 8L, 10L, 7L, 10L, 10L, 
5L, 7L, 2L, 2L, 5L, 9L, 2L, 12L, 10L, 10L, 7L, 12L, 9L, 10L, 
10L, 9L, 8L, 10L, 9L, 9L, 9L, 12L, 12L, 3L, 3L, 10L, 9L, 9L, 
7L, 9L, 12L, 9L, 6L, 10L, 9L, 5L, 6L, 6L, 5L, 9L, 9L, 9L, 7L, 
10L, 2L, 9L, 7L, 12L, 10L, 2L, 10L, 9L, 9L, 10L, 6L, 6L), .Label = c("Branch Refusal", 
"Club", "Corporate Refusal", "Credit Hold", "Diamond", "Enrollment", 
"Gold", "Institutional", "No Program", "Platinum", "RSVP", "Silver"
), class = "factor"), CUST_STATE_KEY = structure(c(29L, 2L, 2L, 
2L, 2L, 26L, 45L, 6L, 4L, 36L, 23L, 20L, 7L, 41L, 41L, 13L, 2L, 
34L, 4L, 4L, 5L, 23L, 20L, 28L, 43L, 19L, 10L, 36L, 15L, 49L, 
44L, 16L, 6L, 6L, 28L, 6L, 6L, 49L, 2L, 44L, 44L, 10L, 16L, 11L, 
28L, 15L, 17L, 49L, 30L, 2L, 44L, 26L, 44L, 10L, 10L, 5L, 38L, 
18L, 18L, 11L, 6L, 6L, 28L, 6L, 6L, 6L, 15L, 25L, 49L, 49L, 24L, 
44L, 44L, 44L, 44L, 45L, 15L, 48L, 34L, 48L, 48L, 48L, 48L, 48L, 
23L, 18L, 18L, 20L, 41L, 39L, 32L, 28L, 51L, 49L, 48L, 39L, 44L, 
5L, 10L, 6L, 6L), .Label = c("AK ", "AL ", "AR ", "AZ ", "CA ", 
"CO ", "CT ", "DC ", "DE ", "FL ", "GA ", "HI ", "IA ", "ID ", 
"IL ", "IN ", "KS ", "KY ", "LA ", "MA ", "MD ", "ME ", "MI ", 
"MN ", "MO ", "MS ", "MT ", "NC ", "ND ", "NE ", "NH ", "NJ ", 
"NM ", "NV ", "NY ", "OH ", "OK ", "OR ", "PA ", "RI ", "SC ", 
"SD ", "TN ", "TX ", "UT ", "VA ", "VT ", "WA ", "WI ", "WV ", 
"WY ", "ZZ "), class = "factor"), CUST_REGION_DESCR = structure(c(2L, 
5L, 5L, 5L, 5L, 6L, 1L, 1L, 1L, 4L, 4L, 3L, 3L, 6L, 6L, 2L, 5L, 
5L, 1L, 1L, 7L, 4L, 3L, 6L, 6L, 5L, 6L, 4L, 2L, 2L, 5L, 4L, 6L, 
6L, 6L, 6L, 6L, 2L, 5L, 5L, 5L, 6L, 4L, 6L, 6L, 2L, 5L, 2L, 2L, 
5L, 5L, 6L, 5L, 6L, 6L, 7L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 2L, 5L, 2L, 2L, 2L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 7L, 1L, 1L, 
1L, 1L, 1L, 4L, 4L, 4L, 3L, 6L, 3L, 3L, 6L, 1L, 2L, 1L, 3L, 5L, 
5L, 6L, 1L, 1L), .Label = c("MOUNTAIN WEST REGION", "NORTH CENTRAL REGION", 
"NORTH EAST REGION        ", "OHIO VALLEY REGION       ", "SOUTH CENTRAL REGION     ", 
"SOUTH EAST REGION        ", "WESTERN REGION           "), class = "factor"), 
    COMM_SUB_LINE_DESCR = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 
    5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
    5L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
    5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
    5L, 5L, 5L, 5L, 5L), .Label = c("EQUIPMENT", "Handpieces", 
    "PATTERSON BRAND", "PROFESSIONAL SERVICES", "SUNDRIES"), class = "factor"), 
    Sales = c(71.52, 118.3, 114, 131.5, 263, 112.6, 62.45, 12.83, 
    26.28, 394.5, 161, 90, 112.6, 729, 99.5, 157.8, 118.3, 181.65, 
    87.6, 87.6, 263, 80.5, 90, 319.68, 132.5, 131.5, 228, 8, 
    25.94, 13.54, 28.5, 28.5, 57, 22.8, 14.25, 34.2, 34.2, 13.54, 
    14.25, 28.5, 51.28, 28.5, 57, 12.82, 14.25, 72.34, 79.5, 
    35.77, 39.75, 79.5, 79.5, 32.19, 79.5, 79.5, 79.5, 35.77, 
    79.5, 39.75, 35.77, 71.54, 31.8, 95.4, 39.75, 127.2, 63.6, 
    31.8, 76.12, 83.8, 66.2, 13.24, 53.67, 33.84, 42.3, 83.8, 
    16.64, 56.97, 36.72, 20.95, 50.76, 101.52, 120.26, 18.23, 
    60.96, 142.24, 42.3, 16.4, 67.8, 41.9, 59.85, 51.39, 105.75, 
    33.9, 48.15, 0, 49.2, 71.7, 20.65, 30.94, 33.57, 35.55, 718.2
    ), QtySold = c(6L, 10L, 8L, 10L, 20L, 10L, 5L, 1L, 3L, 30L, 
    20L, 10L, 10L, 90L, 10L, 12L, 10L, 15L, 10L, 10L, 20L, 10L, 
    10L, 32L, 10L, 10L, 20L, 1L, 2L, 1L, 2L, 2L, 5L, 2L, 1L, 
    3L, 3L, 1L, 1L, 2L, 4L, 2L, 4L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
    2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 3L, 1L, 4L, 2L, 
    1L, 4L, 4L, 5L, 1L, 3L, 2L, 2L, 4L, 1L, 3L, 3L, 1L, 3L, 6L, 
    7L, 1L, 3L, 7L, 2L, 1L, 4L, 2L, 3L, 3L, 5L, 2L, 1L, 1L, 4L, 
    6L, 1L, 1L, 1L, 1L, 2L), MFGCOST = c(44.7, 74.5, 59.6, 74.5, 
    149, 74.5, 37.25, 7.45, 22.35, 223.5, 149, 74.5, 74.5, 670.5, 
    74.5, 89.4, 74.5, 111.75, 74.5, 74.5, 149, 74.5, 74.5, 238.4, 
    74.5, 74.5, 149, 7.45, 13.2, 6.6, 13.2, 13.2, 33, 13.2, 6.6, 
    19.8, 19.8, 6.6, 6.6, 13.2, 26.4, 13.2, 26.4, 6.6, 6.6, 38, 
    38, 19, 19, 38, 38, 19, 38, 38, 38, 19, 38, 19, 19, 38, 19, 
    57, 19, 76, 38, 19, 45, 45, 56.25, 11.25, 33.75, 22.5, 22.5, 
    45, 11.25, 33.75, 33.75, 11.25, 33.75, 67.5, 78.75, 11.25, 
    33.75, 78.75, 22.5, 11.25, 45, 22.5, 33.75, 33.75, 56.25, 
    22.5, 26.6, 7.77, 31.08, 46.62, 15.75, 25.58, 25.58, 25.58, 
    486), MarginDollars = c(25.92, 42.3, 53.2, 55.5, 111, 36.6, 
    24.45, 5.23, 3.48, 166.5, 9, 14, 36.6, 45, 23.5, 66.6, 42.3, 
    67.65, 11.6, 11.6, 111, 4.5, 14, 76.48, 56.5, 55.5, 76, 0.4, 
    11.72, 6.43, 14.28, 14.28, 21.45, 8.58, 7.14, 12.87, 12.87, 
    6.43, 7.14, 14.28, 22.84, 14.28, 28.56, 5.71, 7.14, 32.6, 
    39.76, 15.9, 19.88, 39.76, 39.76, 12.32, 39.76, 39.76, 39.76, 
    15.9, 39.76, 19.88, 15.9, 31.8, 11.93, 35.79, 19.88, 47.72, 
    23.86, 11.93, 30.2, 37.88, 8.8, 1.76, 19.23, 10.88, 19.34, 
    37.88, 5.16, 22.53, 2.28, 9.47, 16.32, 32.64, 39.9, 6.75, 
    26.52, 61.88, 19.34, 4.92, 21.88, 18.94, 25.41, 16.95, 48.35, 
    10.94, 21.02, -7.62, 18.72, 25.98, 4.58, 4.85, 7.48, 9.46, 
    222.48), New_Product_Type = structure(c(1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", "yes"), class = "factor")), row.names = 100:200, class = "data.frame")

Here is the structure of df:

Summary
> str(df)
'data.frame':	51433 obs. of  12 variables:
 $ MAJOR_CATEGORY_DESCR: Factor w/ 25 levels "AIR ABRASION",..: 23 23 23 23 21 21 21 23 23 23 ...
 $ PATT_BRAND_CATEGORY : int  0 0 0 0 0 0 0 0 0 0 ...
 $ CUST_BRANCH_DESCR   : Factor w/ 60 levels "ALBUQUERQUE              ",..: 42 46 7 42 3 22 22 35 45 32 ...
 $ PROGRAM_LEVEL_DESCR : Factor w/ 12 levels "Branch Refusal",..: 10 10 9 7 9 9 10 10 5 9 ...
 $ CUST_STATE_KEY      : Factor w/ 52 levels "AK ","AL ","AR ",..: 39 28 11 32 44 28 28 49 38 5 ...
 $ CUST_REGION_DESCR   : Factor w/ 7 levels "MOUNTAIN WEST REGION",..: 3 6 6 3 5 6 6 2 1 1 ...
 $ COMM_SUB_LINE_DESCR : Factor w/ 5 levels "EQUIPMENT","Handpieces",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ Sales               : num  210 -76.2 275.6 138.7 226 ...
 $ QtySold             : int  3 -1 2 1 2 1 2 1 1 2 ...
 $ MFGCOST             : num  134.6 -44.9 162.5 81.2 136 ...
 $ MarginDollars       : num  72.7 -30.4 109.8 55.8 87.3 ...
 $ New_Product_Type    : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

I picked my product type (3 types: House Brand, Sundry and Handpieces) to be the main attribute to split the tree.

I then created a column called "New_Product_Type" which basically sorts out two binary values: yes = House Brand and no = other types.

My decision is balanced.

I am looking forward to a quite diverse unbalanced tree as a result, so I interpreted this as an error in my analysis.

I cannot even see what the root is.

Why is the tree split this way? I meant it is great that it is split but it does not give me any insight or a powerful reason whether to invest in the House Brand Product.

Here is my code:

#df is from the sample data frame above
set.seed(415)
train = sample(1:nrow(df), 0.8*nrow(df)) # training row indices

training_data = df[train,] # training data
testing_data = df[-train,] # testing data

#fit the tree model using training data
tree_model2 = rpart(New_Product_Type ~.,data = training_data)
plot(tree_model2)
text(tree_model2, pretty = 0, cex = 0.1)

summary(tree_model2)
out = predict(tree_model2) # predict the training data

input.newproduct = as.character(training_data$new_ProductName) # actuals
pred.newproduct = colnames(out)[max.col(out,ties.method = c("first"))] # predicted
mean (input.newproduct != pred.newproduct) # misclassification % 

# Cross Validation to see how much we need to prune the tree
set.seed(400)
cv_Tree = cv.tree(tree_model, FUN = prune.misclass) # run cross validation
names(cv_Tree)
attach(cv_Tree)
plot(cv_Tree, cex = 0.8) # plot the CV
plot(size, dev, type = "b")

treePruneMod = prune.misclass(tree_model, best = 6) # set size corresponding to lowest value in the plot above.
plot(treePruneMod)
text(treePruneMod, pretty = 0)


out = predict(treePruneMod,newdata = testing_data) # fit the pruned tree
pred.newproduct = colnames(out)[max.col(out,ties.method = c("random"))] # predicted
mean(testing_data$new_ProductName != pred.newproduct) # calculate Mis-classification error
out = predict(treePruneMod, testing_data, type = "class") # Predict testData with Pruned tree