Example 1 is not exactly what you want but it should be enough to get you started with making the function you want.
However, discretize() is a function in the arules package that appears to do exactly what you want; it is shown in Example 2. You shouldn't build your own statistics functions: there are just too many odds and ends to trip over. arules is a package built by statisticians.
If you feel you need to build your own discretize() function, then you should use arules to check that your function is generating the correct output.
Example 1
# Example data: one row per applicant, with a numeric Income and a logical
# Loan flag. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
tbl <- tibble::tribble(
  ~Income, ~Loan,
  12, TRUE,
  13, TRUE,
  14, TRUE,
  12, FALSE,
  14, TRUE,
  16, TRUE,
  18, FALSE,
  33, TRUE,
  22, FALSE,
  24, FALSE,
  46, FALSE,
  53, FALSE,
  24, FALSE,
  19, FALSE,
  25, FALSE,
  32, TRUE,
  33, TRUE,
  37, FALSE,
  21, FALSE,
  25, TRUE
)
#' Discretize the Income column into equal-width bins
#'
#' Splits the range of `tbl$Income` into `bin_count` intervals of equal width
#' and replaces every income with a text label for the interval it falls in.
#'
#' @param tbl A data frame or tibble with a numeric `Income` column (no NAs).
#' @param bin_count Number of equal-width bins; a single number >= 1.
#' @return `tbl` with `Income` replaced by character labels of the form
#'   `"( start , end ]"`. All other columns are returned unchanged.
disc <- function(tbl, bin_count) {
  income <- tbl$Income
  stopifnot(
    is.numeric(income), !anyNA(income),
    is.numeric(bin_count), length(bin_count) == 1L, bin_count >= 1
  )

  # Nothing to bin: range() on an empty vector would yield (Inf, -Inf).
  if (length(income) == 0L) {
    tbl$Income <- character(0)
    return(tbl)
  }

  r <- range(income)
  bin_size <- (r[2] - r[1]) / bin_count
  # Lower edge of each bin: min, min + size, min + 2 * size, ...
  range_starts <- r[1] + (seq_len(bin_count) - 1) * bin_size

  # Index of the last bin whose lower edge is <= the value. The index is kept
  # in its own vector rather than written back into Income, so a freshly
  # assigned index can never be mistaken for an income in a later comparison.
  bin_index <- findInterval(income, range_starts)

  start <- r[1] + (bin_index - 1) * bin_size
  # Label text is kept exactly as "( start , end ]". Note that the assignment
  # above is actually closed on the left (a value equal to a lower edge falls
  # into that bin) and the maximum falls into the last bin.
  # Replacing the whole column at once changes its type from numeric to
  # character without relying on single-cell coercion.
  tbl$Income <- paste("(", start, ",", start + bin_size, "]")
  tbl
}
# Bin Income into four equal-width intervals and print the result.
disc(tbl, bin_count = 4)
#> # A tibble: 20 x 2
#> Income Loan
#> <chr> <lgl>
#> 1 ( 12 , 22.25 ] T
#> 2 ( 12 , 22.25 ] T
#> 3 ( 12 , 22.25 ] T
#> 4 ( 12 , 22.25 ] F
#> 5 ( 12 , 22.25 ] T
#> 6 ( 12 , 22.25 ] T
#> 7 ( 12 , 22.25 ] F
#> 8 ( 32.5 , 42.75 ] T
#> 9 ( 12 , 22.25 ] F
#> 10 ( 22.25 , 32.5 ] F
#> 11 ( 42.75 , 53 ] F
#> 12 ( 42.75 , 53 ] F
#> 13 ( 22.25 , 32.5 ] F
#> 14 ( 12 , 22.25 ] F
#> 15 ( 22.25 , 32.5 ] F
#> 16 ( 22.25 , 32.5 ] T
#> 17 ( 32.5 , 42.75 ] T
#> 18 ( 32.5 , 42.75 ] F
#> 19 ( 12 , 22.25 ] F
#> 20 ( 22.25 , 32.5 ] T
Example 2
# Same example data as in Example 1: numeric Income and a logical Loan flag.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
tbl <- tibble::tribble(
  ~Income, ~Loan,
  12, TRUE,
  13, TRUE,
  14, TRUE,
  12, FALSE,
  14, TRUE,
  16, TRUE,
  18, FALSE,
  33, TRUE,
  22, FALSE,
  24, FALSE,
  46, FALSE,
  53, FALSE,
  24, FALSE,
  19, FALSE,
  25, FALSE,
  32, TRUE,
  33, TRUE,
  37, FALSE,
  21, FALSE,
  25, TRUE
)
# Equal-width binning of Income into 4 intervals. Current arules releases
# take the bin count via `breaks` (`categories` is deprecated) and default to
# equal-frequency binning, so the interval method is requested explicitly.
# The factor is wrapped with tibble() under the column name `value` instead
# of calling as_tibble() on a bare vector.
intervals <- tibble::tibble(
  value = arules::discretize(tbl$Income, method = "interval", breaks = 4)
)
intervals
#> # A tibble: 20 x 1
#> value
#> <fct>
#> 1 [12.0,22.2)
#> 2 [12.0,22.2)
#> 3 [12.0,22.2)
#> 4 [12.0,22.2)
#> 5 [12.0,22.2)
#> 6 [12.0,22.2)
#> 7 [12.0,22.2)
#> 8 [32.5,42.8)
#> 9 [12.0,22.2)
#> 10 [22.2,32.5)
#> 11 [42.8,53.0]
#> 12 [42.8,53.0]
#> 13 [22.2,32.5)
#> 14 [12.0,22.2)
#> 15 [22.2,32.5)
#> 16 [22.2,32.5)
#> 17 [32.5,42.8)
#> 18 [32.5,42.8)
#> 19 [12.0,22.2)
#> 20 [22.2,32.5)