ANOVA Table for Full Linear Model

AmeliaMN · October 11, 2019, 4:07pm

I'm teaching Multiple Regression again this semester, so I'm once again reminded that the ANOVA output from R is not the same as from other statistical software. A quick google turned up this post from r-help which has essentially my exact question, so I'll quote from it here (data changed for better reproducibility):

I am trying to figure out how to get an ANOVA table that shows the sum of squares. degrees of freedom, etc, for the full model versus the error (aka residuals).

Here is an example of the kind of table I'd like to get:

Analysis of Variance
Source           DF          SS          MS         F        P
Regression       2        0.9715       0.485728    52.13    0.000
Error            29       0.2702       0.009318
Total            31       1.2417

This kind of table is prevalent throughout my statistics textbook, and can apparently be easily obtained in other statistical software tools. I'm not saying this as a gripe, but just as evidence that I'm not trying to do something obviously bizarre.

Here is an example of the only kind of ANOVA table for a single linear model that I've been able to get using R:

library(Stat2Data)
data("NFLStandings2016")
m1 = lm(WinPct ~ PointsFor + PointsAgainst, data = NFLStandings2016)
anova(m1)
#> Analysis of Variance Table
#> 
#> Response: WinPct
#>               Df  Sum Sq Mean Sq F value    Pr(>F)    
#> PointsFor      1 0.41262 0.41262  44.280 2.692e-07 ***
#> PointsAgainst  1 0.55884 0.55884  59.972 1.537e-08 ***
#> Residuals     29 0.27023 0.00932                      
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

^{Created on 2019-10-11 by the reprex package (v0.2.1)}

Is there a way to get an ANOVA table with the full linear regression model considered as a whole rather than broken down into each additional predictor variable? In other words, is there a way to get the former kind of table?

The response on r-help includes quotes like "the logical thing to do would be" and also "if you really want the column of MS, you have a little extra work to do." Is there really no command in R that produces this kind of summary table? My guess is it's hiding in some modeling package and I haven't stumbled across it, but I also wouldn't be surprised if there was some base print method I'm missing.

mattwarkentin · October 11, 2019, 11:59pm

Hi Amelia,

Does the following code achieve what you were hoping for?

library(Stat2Data)
library(rms)
data("NFLStandings2016")

anova(rms::ols(WinPct ~ PointsFor + PointsAgainst, data = NFLStandings2016))

                Analysis of Variance          Response: WinPct 

 Factor        d.f. Partial SS MS          F     P     
 PointsFor      1   0.3895631  0.389563129 41.81 <.0001
 PointsAgainst  1   0.5588400  0.558839981 59.97 <.0001
 REGRESSION     2   0.9714563  0.485728140 52.13 <.0001
 ERROR         29   0.2702337  0.009318404

RussS · October 12, 2019, 12:40am

Here is a modification of the base R anova call. I believe this should work for general lm/aov objects, but I haven't tested it thoroughly. Note that the reg_collapse argument will collapse to Source by default, but by setting it to FALSE you can get the original rows back, just with a Total SS row.

anova_alt = function (object, reg_collapse=TRUE,...) 
{
  if (length(list(object, ...)) > 1L) 
    return(anova.lmlist(object, ...))
  if (!inherits(object, "lm")) 
    warning("calling anova.lm(<fake-lm-object>) ...")
  w <- object$weights
  ssr <- sum(if (is.null(w)) object$residuals^2 else w * object$residuals^2)
  mss <- sum(if (is.null(w)) object$fitted.values^2 else w * 
               object$fitted.values^2)
  if (ssr < 1e-10 * mss) 
    warning("ANOVA F-tests on an essentially perfect fit are unreliable")
  dfr <- df.residual(object)
  p <- object$rank
  if (p > 0L) {
    p1 <- 1L:p
    comp <- object$effects[p1]
    asgn <- object$assign[stats:::qr.lm(object)$pivot][p1]
    nmeffects <- c("(Intercept)", attr(object$terms, "term.labels"))
    tlabels <- nmeffects[1 + unique(asgn)]
    ss <- c(vapply(split(comp^2, asgn), sum, 1), ssr)
    df <- c(lengths(split(asgn, asgn)), dfr)
    if(reg_collapse){
      if(attr(object$terms, "intercept")){
        collapse_p<-2:(length(ss)-1)
        ss<-c(ss[1],sum(ss[collapse_p]),ss[length(ss)])
        df<-c(df[1],sum(df[collapse_p]),df[length(df)])
        tlabels<-c(tlabels[1],"Source")
      } else{
        collapse_p<-1:(length(ss)-1)
        ss<-c(sum(ss[collapse_p]),ss[length(ss)])
        df<-c(df[1],sum(df[collapse_p]),df[length(df)])
        tlabels<-c("Regression")
      }
    }
  }else {
    ss <- ssr
    df <- dfr
    tlabels <- character()
    if(reg_collapse){
      collapse_p<-1:(length(ss)-1)
      ss<-c(sum(ss[collapse_p]),ss[length(ss)])
      df<-c(df[1],sum(df[collapse_p]),df[length(df)])
    }
  }
 
  ms <- ss/df
  f <- ms/(ssr/dfr)
  P <- pf(f, df, dfr, lower.tail = FALSE)
  table <- data.frame(df, ss, ms, f, P)
  table <- rbind(table, 
                 colSums(table))
  table$ms[nrow(table)]<-table$ss[nrow(table)]/table$df[nrow(table)]
  table[length(P):(length(P)+1), 4:5] <- NA
  dimnames(table) <- list(c(tlabels, "Error","Total"), 
                          c("Df","SS", "MS", "F", 
                            "P"))
  if (attr(object$terms, "intercept")){
    table <- table[-1, ]
    table$MS[nrow(table)]<-table$MS[nrow(table)]*(table$Df[nrow(table)])/(table$Df[nrow(table)]-1)
    table$Df[nrow(table)]<-table$Df[nrow(table)]-1
  }
  structure(table, heading = c("Analysis of Variance Table\n"), 
            class = c("anova", "data.frame"))
}

## Warpbreaks example
fm1 <- lm(breaks ~ wool*tension, data = warpbreaks)
anova_alt(fm1)
#> Analysis of Variance Table
#> 
#>        Df    SS     MS      F         P
#> Source  5  3488 697.54 5.8279 0.0002772
#> Error  48  5745 119.69                 
#> Total  53 52018 981.47

Created on 2019-10-11 by the reprex package (v0.3.0)

AmeliaMN · October 14, 2019, 3:02pm

That's close, but still not it. I want a single line for the model, rather than individual lines for each term.

AmeliaMN · October 14, 2019, 3:02pm

Nice, thank you! I'm surprised this function doesn't exist somewhere already, but this is exactly the output I was looking for.

jimhunterbr · October 16, 2019, 7:26pm

Stat teachers all over the world thank you RussS for that function. Where does it come from if you didn't write it from scratch? If you did, great work!

mattwarkentin · October 16, 2019, 7:38pm

The REGRESSION row does correspond to the overall model you wanted, the output just also contains the individual terms, which can be removed.

                Analysis of Variance          Response: WinPct 

 Factor        d.f. Partial SS MS          F     P     
 PointsFor      1   0.3895631  0.389563129 41.81 <.0001
 PointsAgainst  1   0.5588400  0.558839981 59.97 <.0001
 REGRESSION     2   0.9714563  0.485728140 52.13 <.0001
 ERROR         29   0.2702337  0.009318404

Anyway, it seems like the custom function from @RussS is more to your liking.

friendly · October 18, 2019, 8:13pm

Be careful what you wish for. I don't know what "other" software you are comparing with, but IMHO, the test of the overall model w/o tests for individual effects, in general, is either less than useful or misleading.
It is one of the questions to ask, but not the only one.

Ideally, interactive software would provide a + box next to the overall test that could be clicked on to expand, as well as + next to each 2+ df term for a similar purpose.

ejlatour · October 21, 2019, 6:53pm

It seems like the total sum of squares is not calculating correctly. I haven't figured out how to resolve it in your code yet, but I wanted to bring it up. I would expect:

#### NFL example -------------------------------- 

library(Stat2Data)
data("NFLStandings2016")
m1 <- lm(WinPct ~ PointsFor + PointsAgainst, data = NFLStandings2016)
# anova_alt(m1)
# Analysis of Variance Table
# 
#        Df     SS      MS      F          P
# Source  2 0.9715 0.48573 52.126 2.4948e-10
# Error  29 0.2702 0.00932                  
# Total  31 9.2497 0.29838

# Total sum of square 
sum(anova(m1)[, 2])
#> [1] 1.24169


#### Warpbreaks example -------------------------------- 

fm1 <- lm(breaks ~ wool*tension, data = warpbreaks)
# anova_alt(fm1)
# Analysis of Variance Table
# 
#        Df    SS     MS      F         P
# Source  5  3488 697.54 5.8279 0.0002772
# Error  48  5745 119.69                 
# Total  53 52018 981.47 

# Total sum of square 
sum(anova(fm1)[, 2])
#> [1] 9232.815

^{Created on 2019-10-21 by the reprex package (v0.3.0)}

RussS · October 22, 2019, 11:25am

You are absolutely correct. I had a mistake in the total sums of squares calculation because of the presence of the intercept SS. I tried to respect the original function call as much as possible to allow for non-intercept models, but it was tricky how to incorporate that check into the old code so as not to break it. I believe this version is correct. Thanks for double checking the bottom row!

anova_alt = function (object, reg_collapse=TRUE,...) 
{
  if (length(list(object, ...)) > 1L) 
    return(anova.lmlist(object, ...))
  if (!inherits(object, "lm")) 
    warning("calling anova.lm(<fake-lm-object>) ...")
  w <- object$weights
  ssr <- sum(if (is.null(w)) object$residuals^2 else w * object$residuals^2)
  mss <- sum(if (is.null(w)) object$fitted.values^2 else w * 
               object$fitted.values^2)
  if (ssr < 1e-10 * mss) 
    warning("ANOVA F-tests on an essentially perfect fit are unreliable")
  dfr <- df.residual(object)
  p <- object$rank
  if (p > 0L) {
    p1 <- 1L:p
    comp <- object$effects[p1]
    asgn <- object$assign[stats:::qr.lm(object)$pivot][p1]
    nmeffects <- c("(Intercept)", attr(object$terms, "term.labels"))
    tlabels <- nmeffects[1 + unique(asgn)]
    ss <- c(vapply(split(comp^2, asgn), sum, 1), ssr)
    df <- c(lengths(split(asgn, asgn)), dfr)
    if(reg_collapse){
      if(attr(object$terms, "intercept")){
        collapse_p<-2:(length(ss)-1)
        ss<-c(ss[1],sum(ss[collapse_p]),ss[length(ss)])
        df<-c(df[1],sum(df[collapse_p]),df[length(df)])
        tlabels<-c(tlabels[1],"Source")
      } else{
        collapse_p<-1:(length(ss)-1)
        ss<-c(sum(ss[collapse_p]),ss[length(ss)])
        df<-c(df[1],sum(df[collapse_p]),df[length(df)])
        tlabels<-c("Regression")
      }
    }
  }else {
    ss <- ssr
    df <- dfr
    tlabels <- character()
    if(reg_collapse){
      collapse_p<-1:(length(ss)-1)
      ss<-c(sum(ss[collapse_p]),ss[length(ss)])
      df<-c(df[1],sum(df[collapse_p]),df[length(df)])
    }
  }
  
  ms <- ss/df
  f <- ms/(ssr/dfr)
  P <- pf(f, df, dfr, lower.tail = FALSE)
  table <- data.frame(df, ss, ms, f, P)
  table <- rbind(table, 
                 colSums(table))
  if (attr(object$terms, "intercept")){
   table$ss[nrow(table)]<- table$ss[nrow(table)] - table$ss[1]
  }
  table$ms[nrow(table)]<-table$ss[nrow(table)]/table$df[nrow(table)]
  table[length(P):(length(P)+1), 4:5] <- NA
  dimnames(table) <- list(c(tlabels, "Error","Total"), 
                          c("Df","SS", "MS", "F", 
                            "P"))
  if (attr(object$terms, "intercept")){
    table <- table[-1, ]
    table$MS[nrow(table)]<-table$MS[nrow(table)]*(table$Df[nrow(table)])/(table$Df[nrow(table)]-1)
    table$Df[nrow(table)]<-table$Df[nrow(table)]-1
  }
  structure(table, heading = c("Analysis of Variance Table\n"), 
            class = c("anova", "data.frame"))
}

fm1 <- lm(breaks ~ wool*tension, data = warpbreaks)
anova_alt(fm1)
#> Analysis of Variance Table
#> 
#>        Df     SS     MS      F         P
#> Source  5 3487.7 697.54 5.8279 0.0002772
#> Error  48 5745.1 119.69                 
#> Total  53 9232.8 174.20

Created on 2019-10-22 by the reprex package (v0.3.0)

ejlatour · October 22, 2019, 4:54pm

Excellent!! Nice fix! This morning I had something very similar but did the entire fix in the last if (attr(object$terms, "intercept")) statement in your code. I confirm that the last row (and the rest of the table) are what I would expect! Thanks for sharing this with the community!

system · November 12, 2019, 4:54pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.