R thinks my column is factor rather than numeric

I'm trying to do something fairly simple:
for a data frame that looks something like this:
a b c
1 0 2 0
2 1 2 0
3 2 1 1
4 0 4 3

I want to create another column that shows, for each row, what fraction of the total of column a that row's value represents. For example, row 1 would be 0 and row 2 would be 0.33.

I've tried using all the tools I could think of including

data <- mutate(data, a_pct=a/sum(a))
Error: Column data is of unsupported class data.frame

sum(data$a)
Error in Summary.factor(c(8L, 175L, 1L, 140L, 8L, 1L, 1L, 140L, 1L, 1L, :
‘sum’ not meaningful for factors

column_sums <- colSums(data$a, data$b)
Error in colSums(data$a, data$b) :
'x' must be an array of at least two dimensions

I tried converting column a using as.numeric(), but it just created a vector of NA's.

I'm running out of ideas and feeling confused because the data is there when I export it to a csv or when I open the dataframe, not sure why I'm having so much trouble with this.

Is there some text in the columns?

there are some NA's, but I tried using na.rm=TRUE with sum()... other than that no text

Could you run the command dput(data) (with your data table), and then paste the output between a pair of triple backticks (```), like this?

```
<--- paste output of dput(data) here
```

That would help folks understand what might be the issue.

structure(list(ï..Aerospace.Vehicles.and.Defense = structure(c(1L, 
6L, 5L, 1L, 2L, 5L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 6L, 1L, 3L, 6L, 1L, 1L, 7L), .Label = c("0", 
"1,075", "175", "235", "375", "60", "760"), class = "factor"), 
    Agricultural.Inputs.and.Services = c(96L, 57L, 108L, 10L, 
    20L, 60L, 10L, 10L, 375L, 60L, 10L, 0L, 10L, 20L, 0L, 0L, 
    0L, 0L, 10L, 360L, 10L, 0L, 10L, 0L, 92L, 60L, 30L, 10L, 
    0L), Apparel = c(10L, 92L, 0L, 60L, 10L, 0L, 0L, 60L, 0L, 
    0L, 10L, 0L, 175L, 10L, 0L, 0L, 60L, 60L, 60L, 0L, 10L, 0L, 
    185L, 0L, 70L, 0L, 375L, 10L, 60L), Automotive = structure(c(1L, 
    8L, 16L, 9L, 11L, 1L, 1L, 20L, 1L, 17L, 13L, 5L, 14L, 19L, 
    1L, 1L, 1L, 7L, 12L, 6L, 18L, 7L, 9L, 2L, 4L, 1L, 10L, 3L, 
    15L), .Label = c("0", "1,125", "1,135", "1,370", "1,420", 
    "1,785", "10", "110", "175", "185", "20", "235", "3,340", 
    "375", "385", "40", "435", "60", "810", "90"), class = "factor"), 
    Biopharmaceuticals = structure(c(5L, 4L, 3L, 1L, 2L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "10", "175", 
    "2,675", "20", "60"), class = "factor"), Business.Services = structure(c(5L, 
    12L, 4L, 11L, 2L, 14L, 9L, 7L, 1L, 6L, 17L, 8L, 9L, 6L, 14L, 
    14L, 15L, 1L, 18L, 20L, 9L, 14L, 19L, 14L, 16L, 3L, 8L, 13L, 
    10L), .Label = c("0", "1,284", "1,287", "1,293", "1,700", 
    "100", "106", "120", "20", "206", "216", "22,346", "266", 
    "30", "40", "402", "425", "468", "550", "955"), class = "factor"), 
    Coal.Mining = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 10L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), Communications.Equipment.and.Services = c(58L, 
    384L, 115L, 22L, 124L, 0L, 0L, 10L, 0L, 30L, 47L, 70L, 10L, 
    29L, 0L, 10L, 10L, 0L, 105L, 10L, 0L, 0L, 10L, 0L, 42L, 10L, 
    10L, 31L, 10L), Construction.Products.and.Services = structure(c(9L, 
    2L, 16L, 7L, 11L, 8L, 6L, 3L, 1L, 18L, 17L, 15L, 3L, 10L, 
    1L, 3L, 8L, 19L, 14L, 12L, 8L, 1L, 4L, 3L, 5L, 20L, 3L, 14L, 
    13L), .Label = c("0", "1,679", "10", "105", "116", "130", 
    "150", "20", "200", "245", "268", "275", "277", "30", "315", 
    "333", "40", "70", "770", "80"), class = "factor")), class = "data.frame", row.names = c(NA, 
-29L))

Here is a small subset...not sure what the L's mean as they don't show up when I use head() or when I open or export the dataframe...

Thanks, @Erica: The L's indicate integers as opposed to doubles -- are you familiar with those terms?

And could you post the code that gave you the errors, too?

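Working with your original example: your variables have been read as factors, most likely because some values contain thousands separators (e.g. "1,075" in your dput() output), so R could not parse those columns as numbers on import. You can convert them back to numeric so you can do calculations with them; parse_number() also ignores the commas.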

library(tidyverse)

sample_df <- data.frame(
           a = as.factor(c(0, 1, 2, 0)),
           b = as.factor(c(2, 2, 1, 4)),
           c = c(0, 0, 1, 3)
)

sample_df %>%
    mutate_if(is.factor, ~parse_number(as.character(.))) %>% 
    mutate(a_pct = a/sum(a))
#>   a b c     a_pct
#> 1 0 2 0 0.0000000
#> 2 1 2 0 0.3333333
#> 3 2 1 1 0.6666667
#> 4 0 4 3 0.0000000

Created on 2020-03-12 by the reprex package (v0.3.0.9001)

2 Likes

Here is one example

data %>% rename(Aerospace.Vehicles.and.Defense=ï..Aerospace.Vehicles.and.Defense) -> data
data %>% mutate(Aerospace.Vehicles.and.Defense_perc=Aerospace.Vehicles.and.Defense/sum(Aerospace.Vehicles.and.Defense)) -> data
Error in Summary.factor(c(1L, 6L, 5L, 1L, 2L, 5L, 1L, 1L, 1L, 1L, 4L, :
‘sum’ not meaningful for factors

This worked for me. Thank you!

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.