Target Encoding with Regularization - How To

tlg265 · September 19, 2019, 11:11am

Target Encoding with Regularization - How To

library("dplyr")
library("rgl")
library("plot3D")

# Format the arguments with sprintf() and write the result with cat().
# No newline is appended; put "\n" in the format string when one is wanted.
printf <- function(...) {
  cat(sprintf(...))
}

# BEGIN OF SUPPORT FUNCTIONS
# Reference:
# https://github.com/skranz/dplyrExtras/blob/master/R/s_dplyr.r
# String-based counterparts of the dplyr verbs. Each wrapper hands its data and
# the verb arguments (given as character strings) to eval.string.dplyr(),
# together with the name of the dplyr function to call.
s_filter <- function(.data, ...) {
  eval.string.dplyr(.data, "filter", ...)
}
s_select <- function(.data, ...) {
  eval.string.dplyr(.data, "select", ...)
}
s_arrange <- function(.data, ...) {
  eval.string.dplyr(.data, "arrange", ...)
}
s_mutate <- function(.data, ...) {
  eval.string.dplyr(.data, "mutate", ...)
}
s_summarise <- function(.data, ...) {
  eval.string.dplyr(.data, "summarise", ...)
}
s_group_by <- function(.data, ...) {
  eval.string.dplyr(.data, "group_by", ...)
}
# Build a function call from strings and evaluate it.
#   .data     : object passed as the first argument of the call
#   .fun.name : name of the function to call, as a string (e.g. "filter")
#   ...       : character strings (or vectors/lists of them) holding the
#               remaining arguments as R source text
# Returns the value of the evaluated call.
eval.string.dplyr = function(.data, .fun.name, ...) {
  args = list(...)
  # Flatten so that vectors/lists of strings become one character vector.
  args = unlist(args)
  # Assemble source text of the form: <fun>(.data,<arg1>,<arg2>,...)
  code = paste0(.fun.name,"(.data,", paste0(args, collapse=","), ")")
  # eval() is called without an envir argument, so the text is evaluated in
  # this function's frame, where .data (and the locals args/code) are bound.
  df = eval(parse(text=code,srcfile=NULL))
  df  
}
# END OF SUPPORT FUNCTIONS

Imagine a city like the one in the image below. The city is divided into multiple areas (zip codes), and its altitude changes depending on the location. Each zip code has 10 x 10 = 100 houses.

# Plotting grid: regions_per_dimension regions along each axis, which needs
# one more grid node than there are regions.
regions_per_dimension <- 10
x1_plot <- seq(from = 0, to = 100, length.out = regions_per_dimension + 1)
x2_plot <- seq(from = 0, to = 100, length.out = regions_per_dimension + 1)

# Finer coordinate vectors with 100 points per axis.
x1 <- seq(from = 0, to = 100, length.out = 100)
x2 <- seq(from = 0, to = 100, length.out = 100)

# Altitude at location (x1, x2). Works element-wise on vectors, which is what
# outer() requires of its FUN argument.
h <- function(x1, x2) {
  wave_x1 <- 10 * sin((x1 - 50) / 15)
  wave_x2 <- 10 * cos((x2 - 50) / 15)
  wave_x1 + wave_x2 + (10 + 10)
}

# Perspective plot of the altitude surface, evaluated at every node of the
# plotting grid. theta/phi set the viewing direction; expand scales the z axis.
persp3D(
  x = x1_plot,
  y = x2_plot,
  z = outer(x1_plot, x2_plot, h),
  theta = 30,
  phi = 20,
  expand = 0.5,
  ticktype = "detailed",
  xlab = "X1",
  ylab = "X2",
  zlab = "Altitude"
)

The image below gives a better view of the different zip codes in the city, where each square corresponds to one zip code:

# Same surface as before, drawn with phi = 90 and expand = 0 so that it is
# seen from above as a flat map; each grid square is one zip code.
persp3D(
  x = x1_plot,
  y = x2_plot,
  z = outer(x1_plot, x2_plot, h),
  theta = 0,
  phi = 90,
  expand = 0,
  ticktype = "detailed",
  xlab = "X1",
  ylab = "X2",
  zlab = ""
)

Let’s put the observations into a data frame.

As you can see, each zip code is formatted as 1[X1][X2], where X1 and X2 each take a value in { 01, …, 10 } (both written with two digits).

# Build one observation (house) per integer coordinate of a len x len grid.
# Everything is computed on whole vectors instead of filling the columns one
# element at a time inside a double loop.
len <- 100

# Row i corresponds to x2 * len + x1 + 1, i.e. x1 varies fastest.
x1 <- rep(seq_len(len) - 1, times = len)
x2 <- rep(seq_len(len) - 1, each = len)

# h() works element-wise, so it can be applied to the full coordinate vectors.
alt <- h(x1, x2)

# Zip code 1[X1][X2]: every 10 x 10 square of houses shares one code, with
# X1 and X2 running from 01 to 10.
zip <- sprintf("%d", 10000 + (floor(x1 / 10) + 1) * 100 + (floor(x2 / 10) + 1))

df <- data.frame(altitude = alt, x1 = x1, x2 = x2, zip = zip)
df$zip <- as.factor(df$zip)
df <- df %>% arrange(zip) # just putting observations (houses) with same zip together
# View(df)
# View(df)

# Show the column types and the first values of each column.
str(df)

## 'data.frame':    10000 obs. of  4 variables:
##  $ altitude: num  12.09 11.43 10.77 10.1 9.43 ...
##  $ x1      : num  0 1 2 3 4 5 6 7 8 9 ...
##  $ x2      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ zip     : Factor w/ 100 levels "10101","10102",..: 1 1 1 1 1 1 1 1 1 1 ...

# Per-column summary: quartiles for the numeric columns, counts per level for zip.
summary(df)

##     altitude              x1              x2             zip      
##  Min.   : 0.00461   Min.   : 0.00   Min.   : 0.00   10101  : 100  
##  1st Qu.:12.06935   1st Qu.:24.75   1st Qu.:24.75   10102  : 100  
##  Median :19.64418   Median :49.50   Median :49.50   10103  : 100  
##  Mean   :19.44756   Mean   :49.50   Mean   :49.50   10104  : 100  
##  3rd Qu.:26.63075   3rd Qu.:74.25   3rd Qu.:74.25   10105  : 100  
##  Max.   :39.99574   Max.   :99.00   Max.   :99.00   10106  : 100  
##                                                     (Other):9400

Here is a random sample of our observations:

Getting a seed first:

# The seed was originally drawn with: (1:1000000)[sample(1000000, 1)]
# It is hard-coded here so that the sampled preview can be reproduced.
seed_sample <- 636889
printf("seed_sample: %d", seed_sample)

## seed_sample: 636889

Getting the sample of the dataset:

# Draw 20 rows at random (reproducibly, thanks to the fixed seed) and print
# them with the row names reset.
set.seed(seed_sample)
df_preview <- df[sample(seq_len(nrow(df)), size = 20), ]
rownames(df_preview) <- NULL
print(df_preview)

##     altitude x1 x2   zip
## 1  18.172164 15 35 10204
## 2  23.217744 39 52 10406
## 3  26.650505  1 65 10107
## 4  37.851856 73 60 10807
## 5  26.627905  8 51 10106
## 6  36.384171 62 44 10705
## 7   9.467518  4 94 10110
## 8  33.967755 88 59 10906
## 9   4.453650 41  5 10501
## 10  6.745105 45 98 10510
## 11  3.070652 36 10 10402
## 12 35.357103 75 35 10804
## 13 10.835880 13 78 10208
## 14 37.597349 77 40 10805
## 15  3.751946 13  3 10201
## 16 13.439950  1 87 10109
## 17 34.826192 74 66 10807
## 18 24.482611 86 77 10908
## 19 17.024661  8 27 10103
## 20  9.762171 28 26 10303

GOAL

Use an appropriate encoding for the zip code.

Since the zip code is a discrete variable with high cardinality (only 100 levels in this dummy example, but there could be many more in a real one), I want to use Target Encoding (aka Mean Encoding) for it.

The following is one possible implementation:

# Parse a string of R source text and evaluate it.
#   code  : character string containing R code
#   envir : passed straight through to eval() as its envir argument
# Returns the value of the last expression in `code`.
myeval <- function(code, envir = NULL) {
  eval(expr = parse(text = code), envir = envir)
}
# Regularize a target encoding by additive smoothing (shrinkage towards the
# overall mean of the target).
#
# Each level mean is replaced by
#   (n * level_mean + smoothing * overall_mean) / (n + smoothing)
# where n is the number of observations of that level, so levels with few
# observations are pulled more strongly towards the overall mean.
#
#   dataset   : data frame holding the categorical column and the target
#   varname   : name (string) of the categorical column
#   encoding  : data frame with one row per level (levels as row names) and a
#               numeric column "encoding" holding the per-level target mean
#   target    : name (string) of the target column; defaults to "altitude"
#   smoothing : non-negative weight of the overall mean, expressed in
#               "pseudo-observations"; the default 0 applies no regularization
#
# Returns `encoding` with the "encoding" column replaced by the smoothed
# values; with smoothing = 0 the input is returned unchanged.
regularize_encoding <- function(dataset, varname, encoding,
                                target = "altitude", smoothing = 0) {
  stopifnot(
    is.numeric(smoothing),
    length(smoothing) == 1L,
    !is.na(smoothing),
    smoothing >= 0
  )
  if (smoothing == 0) {
    return(encoding)
  }

  keys <- as.character(dataset[[varname]])
  observed <- !is.na(keys)

  # Number of observations per level, aligned with the rows of `encoding`.
  level_sizes <- table(keys[observed])
  n <- as.numeric(level_sizes)[match(rownames(encoding), names(level_sizes))]

  # The overall mean uses the same rows the per-level means are built from.
  overall_mean <- mean(dataset[[target]][observed])

  encoding[["encoding"]] <-
    (n * encoding[["encoding"]] + smoothing * overall_mean) / (n + smoothing)
  encoding
}
# Target (mean) encoding of a categorical column.
#
#   dataset : data frame containing the column `varname` and a numeric
#             column named "altitude" (the target)
#   varname : name (string) of the categorical column to encode
#
# Returns a list with
#   dataset  : the input data frame plus a column "<varname>_encoded" holding,
#              for every row, the encoding of that row's level (NA when the
#              level itself is NA)
#   encoding : data frame with one row per observed level (levels as row
#              names) and a column "encoding", sorted by decreasing encoding
target_encoding_with_regularization <- function(dataset, varname) {
  stopifnot(
    is.data.frame(dataset),
    is.character(varname),
    length(varname) == 1L,
    varname %in% names(dataset),
    "altitude" %in% names(dataset)
  )

  # Work with the level names as strings. Indexing the lookup table with the
  # factor itself would use its integer codes, i.e. row positions, which do
  # not line up with the table once it is sorted by encoding.
  keys <- as.character(dataset[[varname]])
  has_key <- !is.na(keys)

  # Mean of the target per level of `varname` (rows with a missing level
  # cannot be encoded and are left out).
  level_means <- tapply(dataset[["altitude"]][has_key], keys[has_key], mean)
  ranking <- order(-as.numeric(level_means))
  encoding <- data.frame(
    encoding = as.numeric(level_means)[ranking],
    row.names = names(level_means)[ranking]
  )

  encoding <- regularize_encoding(dataset, varname, encoding)

  # Exact lookup by level name; rows with a missing level get NA.
  dataset[[paste0(varname, "_encoded")]] <-
    encoding[["encoding"]][match(keys, rownames(encoding))]

  list(
    dataset = dataset,
    encoding = encoding
  )
}

# Encode the zip column of df; the result holds both the encoded data set and
# the per-level lookup table.
dataset_encoding <- target_encoding_with_regularization(dataset = df, varname = "zip")

This is the encoding we got:

# First 10 rows of the lookup table (the highest encodings).
print(head(dataset_encoding[["encoding"]], n = 10))

##       encoding
## 10806 39.17764
## 10805 38.96352
## 10706 37.45907
## 10705 37.24496
## 10906 36.70025
## 10905 36.48613
## 10807 35.37535
## 10804 34.82470
## 10707 33.65678
## 10704 33.10613

# Last 10 rows of the lookup table (the lowest encodings).
print(tail(dataset_encoding[["encoding"]], n = 10))

##        encoding
## 10409 5.0252593
## 10402 4.5520796
## 10309 3.7229025
## 10210 3.4631097
## 10201 3.3707622
## 10302 3.2497227
## 10410 1.8986943
## 10401 1.8063469
## 10310 0.5963374
## 10301 0.5039900

And this is a sample of the dataset with: zip_encoded :

# Re-use the same seed so the same 20 rows as in the earlier preview are drawn,
# this time from the encoded data set.
set.seed(seed_sample)
df_preview <- dataset_encoding$dataset[
  sample(seq_len(nrow(dataset_encoding$dataset)), size = 20),
]
rownames(df_preview) <- NULL
print(df_preview)

##     altitude x1 x2   zip zip_encoded
## 1  18.172164 15 35 10204   32.066372
## 2  23.217744 39 52 10406   22.296000
## 3  26.650505  1 65 10107   35.375349
## 4  37.851856 73 60 10807   11.924771
## 5  26.627905  8 51 10106   36.486134
## 6  36.384171 62 44 10705   15.866747
## 7   9.467518  4 94 10110   33.106134
## 8  33.967755 88 59 10906    8.999222
## 9   4.453650 41  5 10501   21.506546
## 10  6.745105 45 98 10510   20.038284
## 11  3.070652 36 10 10402   23.965454
## 12 35.357103 75 35 10804   12.397951
## 13 10.835880 13 78 10208   28.533516
## 14 37.597349 77 40 10805   12.040783
## 15  3.751946 13  3 10201   32.897961
## 16 13.439950  1 87 10109   33.656784
## 17 34.826192 74 66 10807   11.924771
## 18 24.482611 86 77 10908    6.720130
## 19 17.024661  8 27 10103   37.459070
## 20  9.762171 28 26 10303   27.466335

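Small issue to solve: the zip code 10805 was encoded as 38.96352, but in the sample above it shows up as 12.040783. This happens because zip is a factor, so encoding[zip, "encoding"] indexes the rows by the factor's integer codes rather than by the zip-code row names. Working on that in the meantime.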

My Question:

Is there any library that lets me do the same encoding as above and, at the same time, apply regularization to prevent overfitting?

All the documentation out there says that regularization is necessary to prevent overfitting, but I couldn’t find a really clear example of how to do it in R.

If possible, could you please provide the lines of code here? I think this can be done in a few lines using the proper libraries (I’m pretty sure there is already some package to achieve this).

Thanks!

JohnMount · September 20, 2019, 12:38am

I would suggest giving our vtreat package a try. It performs impact coding with a cross-validation method that is more resistant to over-fit than regularization. Some examples can be found here.

tlg265 · September 20, 2019, 12:39pm

Thank you @JohnMount, for sure I will check the vtreat package very soon.

In the meantime, I'm wondering if there is any chance to apply regularization to the encoding I have so far in my code above, for example by implementing it inside the function regularize_encoding above?

Thanks!

tlg265 · September 20, 2019, 12:47pm

Since for some reason I'm not able to edit my initial post any more, here I put an update for function target_encoding_with_regularization above:

UPDATE 01

# Target (mean) encoding of a categorical column.
#
#   dataset : data frame containing the column `varname` and a numeric
#             column named "altitude" (the target)
#   varname : name (string) of the categorical column to encode; it is used
#             as a plain column name, so names that are not syntactically
#             valid R identifiers work too
#
# Returns a list with
#   dataset  : the input data frame plus a column "<varname>_encoded" holding,
#              for every row, the encoding of that row's level (NA when the
#              level itself is NA)
#   encoding : data frame with one row per observed level (levels as row
#              names) and a column "encoding", sorted by decreasing encoding
target_encoding_with_regularization <- function(dataset, varname) {
  stopifnot(
    is.data.frame(dataset),
    is.character(varname),
    length(varname) == 1L,
    varname %in% names(dataset),
    "altitude" %in% names(dataset)
  )

  # Level names as strings; rows whose level is missing are excluded from the
  # per-level means.
  keys <- as.character(dataset[[varname]])
  has_key <- !is.na(keys)

  level_means <- tapply(dataset[["altitude"]][has_key], keys[has_key], mean)
  ranking <- order(-as.numeric(level_means))
  encoding <- data.frame(
    encoding = as.numeric(level_means)[ranking],
    row.names = names(level_means)[ranking]
  )

  encoding <- regularize_encoding(dataset, varname, encoding)

  # match() looks the level names up exactly (data-frame row-name indexing
  # would match partially); rows with a missing level get NA.
  dataset[[paste0(varname, "_encoded")]] <-
    encoding[["encoding"]][match(keys, rownames(encoding))]

  list(
    dataset = dataset,
    encoding = encoding
  )
}

system · October 11, 2019, 12:47pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.