creating new columns in data.table by renaming

I have created a function to calculate log of selected variables and rename them. The function however works for 1 data set and does not work for the other. Please help. The working example is attached below.

packages<-c("tidyverse","data.table","readxl","datapasta")
sapply(packages,library,character.only=T)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
#> The following object is masked from 'package:purrr':
#> 
#>     transpose
#> $tidyverse
#>  [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
#>  [7] "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics"  "grDevices"
#> [13] "utils"     "datasets"  "methods"   "base"     
#> 
#> $data.table
#>  [1] "data.table" "forcats"    "stringr"    "dplyr"      "purrr"     
#>  [6] "readr"      "tidyr"      "tibble"     "ggplot2"    "tidyverse" 
#> [11] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
#> [16] "methods"    "base"      
#> 
#> $readxl
#>  [1] "readxl"     "data.table" "forcats"    "stringr"    "dplyr"     
#>  [6] "purrr"      "readr"      "tidyr"      "tibble"     "ggplot2"   
#> [11] "tidyverse"  "stats"      "graphics"   "grDevices"  "utils"     
#> [16] "datasets"   "methods"    "base"      
#> 
#> $datapasta
#>  [1] "datapasta"  "readxl"     "data.table" "forcats"    "stringr"   
#>  [6] "dplyr"      "purrr"      "readr"      "tidyr"      "tibble"    
#> [11] "ggplot2"    "tidyverse"  "stats"      "graphics"   "grDevices" 
#> [16] "utils"      "datasets"   "methods"    "base"

data<-data.table::data.table(
        Year = c("1960Q1", "1960Q2", "1960Q3", "1960Q4", "1961Q1", "1961Q2"),
        CONS = c(6.028278, 6.042633, 6.073044, 6.104793, 6.12905, 6.126869),
         INC = c(6.111467, 6.142037, 6.184149, 6.200509, 6.232448, 6.253829),
         INV = c(5.192957,5.187386,5.220356,
                 5.25749499999999,5.351858,5.308268)
)

#function to calculate log of selected variables
log_fn<-function(DT,cols){
        for(col in cols){
                new_name<-paste0("ln_",col)
                DT[,(new_name):= log(get(col))]
        }
}


# variable selection
cols=data %>% select_if(is.numeric) %>% names() #select columns

log_fn(data,cols) #calculate log of selected variables


#second data

data2<-data.table::data.table(
                                check.names = FALSE,
                            `CALENDAR YEAR` = c("1996Q2",
                                                "1996Q3","1996Q4","1997Q1",
                                                "1997Q2","1997Q3"),
                `Remittance.Billion Rs...3` = c("9177",
                                                "9759","10792","14481","10706",
                                                "13138"),
             `GDP_mp (2011).Billion Rs...5` = c("7731.664146858664","7128.9166154954801",
                                                "8874.8025851047496",
                                                "9006.0042667434409","7983.75766925485",
                                                "7473.9778343319458"),
                  `Oil Price.US$ per Barel` = c("19.476666666666667","20.543333333333333",
                                                "23.16","21.17",
                                                "18.053333333333335","18.523333333333333"),
                        `Exchange rate.US$` = c("34.744",
                                                "35.642966666666602",
                                                "35.737033333333301","35.875833333333297",
                                                "35.8130666666666",
                                                "36.0296666666666"),
                         `REER.Trade based` = c("97.23",
                                                "96.42","96.513333333333307",
                                                "97.99","100.73","102.743333333333"),
                                  NEER....9 = c("103.543333333333","100.61","100.9",
                                                "102.883333333333",
                                                "105.11666666666601","107.023333333333"),
                            `Libor.3-month` = c("5.5197240983606557","5.5965146153846153",
                                                "5.5322879687500004",
                                                "5.5657024590163937","5.8305438095238094",
                                                "5.731850923076923"),
                             `T-bill....11` = c("12.438499999999999","9.1639333329999992",
                                                "8.2095333329999995",
                                                "7.5952333330000004","5.3354666670000004",
                                                "6.1689666670000003"),
               `IR_differential.14-90 days` = c("6.9187759016393438","3.567418717615384",
                                                "2.6772453642499991",
                                                "2.0295308739836067","-0.49507714252380897",
                                                "0.43711574392307728"),
  `US GDP.Billions of Chained 2012 Dollars` = c("10998.322",
                                                "11096.976000000001",
                                                "11212.205","11284.587",
                                                "11472.137000000001","11615.636"),
               `US_unemployment rate....14` = c("5.5",
                                                "5.2666666666666666",
                                                "5.333333333333333","5.2333333333333334","5",
                                                "4.8666666666666671")
)

data2<-data2 %>% mutate(
        across(-`CALENDAR YEAR`, as.numeric))
#columns to calculate log
vars<-data2 %>% 
        select(-c(`CALENDAR YEAR`,`IR_differential.14-90 days`,
                  `Libor.3-month`,`US_unemployment rate....14`,
                  `T-bill....11`)) %>%
        names()

#calculating log of the above variables
log_fn(data2,cols = vars)
#> Warning in `[.data.table`(DT, , `:=`((new_name), log(get(col)))):
#> Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the
#> data.table so that := can add this new column by reference. At an earlier point,
#> this data.table has been copied by R (or was created manually using structure()
#> or similar). Avoid names<- and attr<- which in R currently (and oddly) may
#> copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?
#> setnames and ?setattr. If this message doesn't help, please report your use case
#> to the data.table issue tracker so the root cause can be fixed or this message
#> improved.
Created on 2020-08-18 by the reprex package (v0.3.0)

If you change the order of the steps for data2, does this do what you expect?

# data2 as originally defined
# now define your vars
vars <- data2 %>% 
  select(-c(`CALENDAR YEAR`,`IR_differential.14-90 days`,
            `Libor.3-month`,`US_unemployment rate....14`,
            `T-bill....11`)) %>%
  names()  

# If I understand, you only need to apply your function to the cols in vars?
#  perform as.numeric on the 'vars' columns

data2[, (vars) := lapply(.SD, as.numeric), .SDcols = vars]
# try the function and check
log_fn(data2, cols = vars)
data2[] 

I think the key thing, is to try and avoid the

data2 <- data2 %>% 

pattern with data.table

1 Like

That’s so cool John.. I almost forgot the .SD argument of data.table.. Thanks a lot.

Hi John, one more question in this regard, what is the data.table way to convert all columns except the first column to numeric?


data2[,.SD, .SDcols = !c('CALENDAR YEAR')]

Should work

The problem is that removes my calender year column. I want that to be retained as such and all other columns to be converted to numeric

data2[, (vars) := lapply(.SD, as.numeric), .SDcols = !c('CALENDAR YEAR') ]

Thanks John.. however i dont want to define vars either.. I want it to be done without defining vars. convert all columns of data.table to numeric except Calender Year. This is what i want to do

See below
starting with your data2 data frame

cols <- data2 %>% select(-`CALENDAR YEAR`) %>% names()

vars <- data2 %>% 
  select(-c(`CALENDAR YEAR`,`IR_differential.14-90 days`,
            `Libor.3-month`,`US_unemployment rate....14`,
            `T-bill....11`)) %>%
  names() 

# set all cols except `CALENDAR YEAR` to numeric
for (j in cols) {
  
  set(data2, j = j, value = as.numeric(data2[[j]]))
}

# apply your function. 
# if you don't want to define vars, then pass in character vector
log_fn(data2, cols = vars)
data2[]  # use `[]` to update data2 in the console. Or hit refresh in Environment pane in RStudio

1 Like

thats helpful John!!

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.