Plotting influential points using Cook's

Here is my code:

windows()
par(mfrow=c(3,3))
halfnorm(cook, 5, labs=row, ylab="Cook's Distance")

plot(dfbeta(model.prostate)[,1],ylab="Change in a Coef 1")
abline(h=0)
identify(seq(1:nrow(prostate)), dfbeta(model.prostate)[,1], labels=row.names(prostate))

plot(dfbeta(model.prostate)[,2],ylab="Change in a Coef 2")
abline(h=0)
identify(seq(1:nrow(prostate)), dfbeta(model.prostate)[,2], labels=row.names(prostate))

The first plot does not work, but the second one works.

Here is the error I receive:

Error in identify.default(seq(1:nrow(prostate)), dfbeta(model.prostate)[, :
graphics device closed during call to locator or identify

It is the same code and works for some of my plots (there are a total of eight) and not others. I do not understand what I am doing wrong.
Any help is greatly appreciated.

I can't fully evaluate this without a reprex. See the FAQ: How to do a minimal reproducible example reprex for beginners. However, setting

par(mfrow = c(1,2))

before plotting may work.

I hope I did this correctly...

library("faraway")
require(faraway)

head(prostate)
lcavol lweight age lbph svi lcp gleason pgg45 lpsa
1 -0.5798185 2.7695 50 -1.386294 0 -1.38629 6 0 -0.43078
2 -0.9942523 3.3196 58 -1.386294 0 -1.38629 6 0 -0.16252
3 -0.5108256 2.6912 74 -1.386294 0 -1.38629 7 20 -0.16252
4 -1.2039728 3.2828 58 -1.386294 0 -1.38629 6 0 -0.16252
5 0.7514161 3.4324 62 -1.386294 0 -1.38629 6 0 0.37156
6 -1.0498221 3.2288 50 -1.386294 0 -1.38629 6 0 0.76547

cook <- cooks.distance(out)
par(mfrow=c(1,2))
halfnorm(cook, 3, labs=row, ylab="Cook's Distance")

model.prostate.x <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45, prostate, subset=(cook< max(cook)))

sumary(model.prostate.x)
sumary(model.prostate)

windows()

par(mfrow=c(3,3))
halfnorm(cook, 5, labs=row, ylab="Cook's Distance")
plot(dfbeta(model.prostate)[,1],ylab="Change in a Coef 1")
abline(h=0)
identify(seq(1:nrow(prostate)), dfbeta(model.prostate)[,1], labels=row.names(prostate))

plot(dfbeta(model.prostate)[,2],ylab="Change in a Coef 2")
abline(h=0)
identify(seq(1:nrow(prostate)), dfbeta(model.prostate)[,2], labels=row.names(prostate))

I figured out the problem is my identity statement.
If I delete:
identify(seq(1:nrow(prostate)), dfbeta(model.prostate)[,1], labels=row.names(prostate))

All of my plots come out nice. However, I need to identify the outliers and I have no idea how to do that correctly.

OK, just that part. The abbreviated data produces NaNs, which requires some contortions.

dta <- data.frame(
  lcavol =
    c(-0.5798185, -0.9942523, -0.5108256, -1.2039728, 0.7514161, -1.0498221),
  lweight =
    c(2.7695, 3.3196, 2.6912, 3.2828, 3.4324, 3.2288),
  age =
    c(50, 58, 74, 58, 62, 50),
  lbph =
    c(-1.386294, -1.386294, -1.386294, -1.386294, -1.386294, -1.386294),
  svi =
    c(0, 0, 0, 0, 0, 0),
  lcp =
    c(-1.38629, -1.38629, -1.38629, -1.38629, -1.38629, -1.38629),
  gleason =
    c(6, 6, 7, 6, 6, 6),
  pgg45 =
    c(0, 0, 20, 0, 0, 0),
  lpsa =
    c(-0.43078, -0.16252, -0.16252, -0.16252, 0.37156, 0.76547)
)

model.prostate <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45, data = dta)

summary(model.prostate)
#> 
#> Call:
#> lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + 
#>     gleason + pgg45, data = dta)
#> 
#> Residuals:
#>          1          2          3          4          5          6 
#> -5.021e-03 -1.014e-01  3.404e-17  8.396e-02  1.162e-02  1.083e-02 
#> 
#> Coefficients: (4 not defined because of singularities)
#>              Estimate Std. Error t value Pr(>|t|)  
#> (Intercept) -24.14927    3.77231  -6.402   0.0986 .
#> lcavol        0.36697    0.10008   3.667   0.1695  
#> lweight       2.94551    0.42274   6.968   0.0907 .
#> age          -0.13795    0.02264  -6.093   0.1036  
#> lbph               NA         NA      NA       NA  
#> svi                NA         NA      NA       NA  
#> lcp                NA         NA      NA       NA  
#> gleason       3.77937    0.61410   6.154   0.1025  
#> pgg45              NA         NA      NA       NA  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.1327 on 1 degrees of freedom
#> Multiple R-squared:  0.982,  Adjusted R-squared:  0.9102 
#> F-statistic: 13.68 on 4 and 1 DF,  p-value: 0.1998

cook <- cooks.distance(model.prostate)

find_outliers <- function(x) {
  a <- !is.na(x[x[!is.nan(x)] < max(x[!is.nan(x)])]) == TRUE
  as.numeric(names(which(a == TRUE)))
}

model.prostate.x <- lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45, data = dta[find_outliers(cook),])

summary(model.prostate.x)
#> 
#> Call:
#> lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + 
#>     gleason + pgg45, data = dta[find_outliers(cook), ])
#> 
#> Residuals:
#> ALL 3 residuals are 0: no residual degrees of freedom!
#> 
#> Coefficients: (6 not defined because of singularities)
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)   9.4807         NA      NA       NA
#> lcavol        0.4843         NA      NA       NA
#> lweight      -2.7599         NA      NA       NA
#> age               NA         NA      NA       NA
#> lbph              NA         NA      NA       NA
#> svi               NA         NA      NA       NA
#> lcp               NA         NA      NA       NA
#> gleason           NA         NA      NA       NA
#> pgg45             NA         NA      NA       NA
#> 
#> Residual standard error: NaN on 0 degrees of freedom
#> Multiple R-squared:      1,  Adjusted R-squared:    NaN 
#> F-statistic:   NaN on 2 and 0 DF,  p-value: NA

Thank you, so much for your assistance!

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.