# Decision Tree using Rpart and tree

I am very new to R and I am trying to learn how to build decision trees.

How can I draw an optimal tree and calculate the complexity parameter (cp)?

``````r
> library(datasets)
> dim(iris)
[1] 150   5
> library(rpart)
``````

I have generated the following model:

``````r
iris.tree <- rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, method="class", data=iris)
``````

...What is the difference between using rpart and tree?

They create different objects and use surrogate variables differently.

``````r
library(rpart)
iris_tree <- rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, method="class", data=iris)
plot(iris_tree)
str(iris_tree)
#> List of 14
#>  $ frame              :'data.frame': 5 obs. of  9 variables:
#>   ..$ var       : chr [1:5] "Petal.Length" "<leaf>" "Petal.Width" "<leaf>" ...
#>   ..$ n         : int [1:5] 150 50 100 54 46
#>   ..$ wt        : num [1:5] 150 50 100 54 46
#>   ..$ dev       : num [1:5] 100 0 50 5 1
#>   ..$ yval      : num [1:5] 1 1 2 2 3
#>   ..$ complexity: num [1:5] 0.5 0.01 0.44 0 0.01
#>   ..$ ncompete  : int [1:5] 3 0 3 0 0
#>   ..$ nsurrogate: int [1:5] 3 0 3 0 0
#>   ..$ yval2     : num [1:5, 1:8] 1 1 2 2 3 50 50 0 0 0 ...
#>   .. ..- attr(*, "dimnames")=List of 2
#>   .. .. ..$ : NULL
#>   .. .. ..$ : chr [1:8] "" "" "" "" ...
#>  $ where              : Named int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
#>   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
#>  $ call               : language rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,      data = iris, method = "class")
#>  $ terms              :Classes 'terms', 'formula'  language Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
#>   .. ..- attr(*, "variables")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
#>   .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
#>   .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. ..$ : chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
#>   .. .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
#>   .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
#>   .. ..- attr(*, "order")= int [1:4] 1 1 1 1
#>   .. ..- attr(*, "intercept")= int 1
#>   .. ..- attr(*, "response")= int 1
#>   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
#>   .. ..- attr(*, "predvars")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
#>   .. ..- attr(*, "dataClasses")= Named chr [1:5] "factor" "numeric" "numeric" "numeric" ...
#>   .. .. ..- attr(*, "names")= chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
#>  $ cptable            : num [1:3, 1:5] 0.5 0.44 0.01 0 1 2 1 0.5 0.06 1.16 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : chr [1:3] "1" "2" "3"
#>   .. ..$ : chr [1:5] "CP" "nsplit" "rel error" "xerror" ...
#>  $ method             : chr "class"
#>  $ parms              :List of 3
#>   ..$ prior: num [1:3(1d)] 0.333 0.333 0.333
#>   .. ..- attr(*, "dimnames")=List of 1
#>   .. .. ..$ : chr [1:3] "1" "2" "3"
#>   ..$ loss : num [1:3, 1:3] 0 1 1 1 0 1 1 1 0
#>   ..$ split: num 1
#>  $ control            :List of 9
#>   ..$ minsplit      : int 20
#>   ..$ minbucket     : num 7
#>   ..$ cp            : num 0.01
#>   ..$ maxcompete    : int 4
#>   ..$ maxsurrogate  : int 5
#>   ..$ usesurrogate  : int 2
#>   ..$ surrogatestyle: int 0
#>   ..$ maxdepth      : int 30
#>   ..$ xval          : int 10
#>  $ functions          :List of 3
#>   ..$ summary:function (yval, dev, wt, ylevel, digits)
#>   ..$ print  :function (yval, ylevel, digits)
#>   ..$ text   :function (yval, dev, wt, ylevel, digits, n, use.n)
#>  $ numresp            : int 5
#>  $ splits             : num [1:14, 1:5] 150 150 150 150 0 0 0 100 100 100 ...
#>   ..- attr(*, "dimnames")=List of 2
#>   .. ..$ : chr [1:14] "Petal.Length" "Petal.Width" "Sepal.Length" "Sepal.Width" ...
#>   .. ..$ : chr [1:5] "count" "ncat" "improve" "index" ...
#>  $ variable.importance: Named num [1:4] 89 81.3 54.1 36
#>   ..- attr(*, "names")= chr [1:4] "Petal.Width" "Petal.Length" "Sepal.Length" "Sepal.Width"
#>  $ y                  : int [1:150] 1 1 1 1 1 1 1 1 1 1 ...
#>  \$ ordered            : Named logi [1:4] FALSE FALSE FALSE FALSE
#>   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
#>  - attr(*, "xlevels")= Named list()
#>  - attr(*, "ylevels")= chr [1:3] "setosa" "versicolor" "virginica"
#>  - attr(*, "class")= chr "rpart"
library(DAAG)
``````

``````r
library(tree)
iris_tr <- tree(Species ~., iris )
plot(iris_tr)
``````

``````r
str(iris_tr)
#> List of 6
#>  $ frame  :'data.frame': 11 obs. of  6 variables:
#>   ..$ var   : Factor w/ 5 levels "<leaf>","Sepal.Length",..: 4 1 5 4 2 1 1 1 4 1 ...
#>   ..$ n     : num [1:11] 150 50 100 54 48 5 43 6 46 6 ...
#>   ..$ dev   : num [1:11] 329.58 0 138.63 33.32 9.72 ...
#>   ..$ yval  : Factor w/ 3 levels "setosa","versicolor",..: 1 1 2 2 2 2 2 3 3 3 ...
#>   ..$ splits: chr [1:11, 1:2] "<2.45" "" "<1.75" "<4.95" ...
#>   .. ..- attr(*, "dimnames")=List of 2
#>   .. .. ..$ : NULL
#>   .. .. ..$ : chr [1:2] "cutleft" "cutright"
#>   ..$ yprob : num [1:11, 1:3] 0.333 1 0 0 0 ...
#>   .. ..- attr(*, "dimnames")=List of 2
#>   .. .. ..$ : NULL
#>   .. .. ..$ : chr [1:3] "setosa" "versicolor" "virginica"
#>  $ where  : Named int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
#>   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
#>  $ terms  :Classes 'terms', 'formula'  language Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
#>   .. ..- attr(*, "variables")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
#>   .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
#>   .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. ..$ : chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
#>   .. .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
#>   .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
#>   .. ..- attr(*, "order")= int [1:4] 1 1 1 1
#>   .. ..- attr(*, "intercept")= int 1
#>   .. ..- attr(*, "response")= int 1
#>   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
#>   .. ..- attr(*, "predvars")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
#>   .. ..- attr(*, "dataClasses")= Named chr [1:5] "factor" "numeric" "numeric" "numeric" ...
#>   .. .. ..- attr(*, "names")= chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
#>  $ call   : language tree(formula = Species ~ ., data = iris)
#>  $ y      : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#>   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
#>  $ weights: num [1:150] 1 1 1 1 1 1 1 1 1 1 ...
#>  - attr(*, "class")= chr "tree"
#>  - attr(*, "xlevels")=List of 4
#>   ..$ Sepal.Length: NULL
#>   ..$ Sepal.Width : NULL
#>   ..$ Petal.Length: NULL
#>   ..$ Petal.Width : NULL
#>  - attr(*, "ylevels")= chr [1:3] "setosa" "versicolor" "virginica"

# from help(rpart)
# This differs from the tree function in S mainly in its handling of surrogate variables. In most details it follows Breiman et. al (1984) quite closely. R package tree provides a re-implementation of tree.

# from the long intro vignette for rpart
# Once a splitting variable and a split point for it have been decided, what is to be done with observations missing that variable? One approach is to estimate the missing datum using the other independent variables; rpart uses a variation of this to define surrogate variables.
``````

Created on 2022-12-08 by the reprex package (v2.0.1)

@technocrat What would be the complexity parameter (cp) and the error rate of both?
Thank you very much

Is there any other library that does something similar?

``````r
library(rpart)
iris_tree <- rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, method="class", data=iris)
summary(iris_tree)
#> Call:
#> rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length +
#>     Petal.Width, data = iris, method = "class")
#>   n= 150
#>
#>     CP nsplit rel error xerror       xstd
#> 1 0.50      0      1.00   1.18 0.05017303
#> 2 0.44      1      0.50   0.74 0.06123180
#> 3 0.01      2      0.06   0.10 0.03055050
#>
#> Variable importance
#>  Petal.Width Petal.Length Sepal.Length  Sepal.Width
#>           34           31           21           14
#>
#> Node number 1: 150 observations,    complexity param=0.5
#>   predicted class=setosa      expected loss=0.6666667  P(node) =1
#>     class counts:    50    50    50
#>    probabilities: 0.333 0.333 0.333
#>   left son=2 (50 obs) right son=3 (100 obs)
#>   Primary splits:
#>       Petal.Length < 2.45 to the left,  improve=50.00000, (0 missing)
#>       Petal.Width  < 0.8  to the left,  improve=50.00000, (0 missing)
#>       Sepal.Length < 5.45 to the left,  improve=34.16405, (0 missing)
#>       Sepal.Width  < 3.35 to the right, improve=19.03851, (0 missing)
#>   Surrogate splits:
#>       Petal.Width  < 0.8  to the left,  agree=1.000, adj=1.00, (0 split)
#>       Sepal.Length < 5.45 to the left,  agree=0.920, adj=0.76, (0 split)
#>       Sepal.Width  < 3.35 to the right, agree=0.833, adj=0.50, (0 split)
#>
#> Node number 2: 50 observations
#>   predicted class=setosa      expected loss=0  P(node) =0.3333333
#>     class counts:    50     0     0
#>    probabilities: 1.000 0.000 0.000
#>
#> Node number 3: 100 observations,    complexity param=0.44
#>   predicted class=versicolor  expected loss=0.5  P(node) =0.6666667
#>     class counts:     0    50    50
#>    probabilities: 0.000 0.500 0.500
#>   left son=6 (54 obs) right son=7 (46 obs)
#>   Primary splits:
#>       Petal.Width  < 1.75 to the left,  improve=38.969400, (0 missing)
#>       Petal.Length < 4.75 to the left,  improve=37.353540, (0 missing)
#>       Sepal.Length < 6.15 to the left,  improve=10.686870, (0 missing)
#>       Sepal.Width  < 2.45 to the left,  improve= 3.555556, (0 missing)
#>   Surrogate splits:
#>       Petal.Length < 4.75 to the left,  agree=0.91, adj=0.804, (0 split)
#>       Sepal.Length < 6.15 to the left,  agree=0.73, adj=0.413, (0 split)
#>       Sepal.Width  < 2.95 to the left,  agree=0.67, adj=0.283, (0 split)
#>
#> Node number 6: 54 observations
#>   predicted class=versicolor  expected loss=0.09259259  P(node) =0.36
#>     class counts:     0    49     5
#>    probabilities: 0.000 0.907 0.093
#>
#> Node number 7: 46 observations
#>   predicted class=virginica   expected loss=0.02173913  P(node) =0.3066667
#>     class counts:     0     1    45
#>    probabilities: 0.000 0.022 0.978
plotcp(iris_tree)
``````

``````r
printcp(iris_tree)
#>
#> Classification tree:
#> rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length +
#>     Petal.Width, data = iris, method = "class")
#>
#> Variables actually used in tree construction:
#> [1] Petal.Length Petal.Width
#>
#> Root node error: 100/150 = 0.66667
#>
#> n= 150
#>
#>     CP nsplit rel error xerror     xstd
#> 1 0.50      0      1.00   1.18 0.050173
#> 2 0.44      1      0.50   0.74 0.061232
#> 3 0.01      2      0.06   0.10 0.030551
``````

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.