Link to dataset :
https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/
library(tidyverse)
## Warning: package 'dplyr' was built under R version 3.5.1
library(tree)
library(plyr)
library(class)
library(rpart)
library(maptree)
library(ROCR)
library(data.table)
spam <- read_table2("data\\spambase.tab", guess_max=2000)
spam <- spam %>%
mutate(y = factor(y, levels=c(0,1), labels=c("good", "spam"))) %>% # label as factors
mutate_at(.vars=vars(-y), .funs=scale) # scale others
calc_error_rate <- function(predicted.value, true.value){
return(mean(true.value!=predicted.value))
}
records = matrix(NA, nrow=3, ncol=2)
colnames(records) <- c("train.error","test.error")
rownames(records) <- c("knn","tree","logistic")
set.seed(1)
test.indices = sample(1:nrow(spam), 1000)
spam.train=spam[-test.indices,]
spam.test=spam[test.indices,]
nfold = 10
set.seed(1)
folds = seq.int(nrow(spam.train)) %>% ## sequential obs ids
cut(breaks = nfold, labels=FALSE) %>% ## sequential fold ids
sample ## random fold ids
##K nearest neighbour test
do.chunk <- function(chunkid, folddef, Xdat, Ydat, k){
train = (folddef!=chunkid)
Xtr = Xdat[train,]
Ytr = Ydat[train]
Xvl = Xdat[!train,]
Yvl = Ydat[!train]
## get classifications for current training chunks
predYtr = knn(train = Xtr, test = Xtr, cl = Ytr, k = k)
## get classifications for current test chunk
predYvl = knn(train = Xtr, test = Xvl, cl = Ytr, k = k)
data.frame(train.error = calc_error_rate(predYtr, Ytr),
val.error = calc_error_rate(predYvl, Yvl))
}
do.chunk(chunkid = 1,folddef = 10,Xdat = spam.train,Ydat = spam.test , k = 10)
Max
October 28, 2018, 9:43pm
2
Can you tell us what the error was, traceback()
output, and the results of sessionInfo()
?
Error in knn(train = Xtr, test = Xtr, cl = Ytr, k = k) :
'train' and 'class' have different lengths
3.
stop("'train' and 'class' have different lengths")
2.
knn(train = Xtr, test = Xtr, cl = Ytr, k = k)
1.
do.chunk(chunkid = 1, folddef = 10, Xdat = spam.train, Ydat = spam.test,
k = 10)
> sessionInfo()
R version 3.4.4 (2018-03-15)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2.2 data.table_1.11.4 ROCR_1.0-7 gplots_3.0.1
[5] maptree_1.4-7 cluster_2.0.6 rpart_4.1-13 class_7.3-14
[9] plyr_1.8.4 tree_1.0-39 forcats_0.3.0 stringr_1.3.1
[13] dplyr_0.7.6 purrr_0.2.5 readr_1.1.1 tidyr_0.8.1
[17] tibble_1.4.2 ggplot2_3.0.0 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] gtools_3.8.1 tidyselect_0.2.4 haven_1.1.2 lattice_0.20-35
[5] colorspace_1.3-2 yaml_2.2.0 rlang_0.2.2 pillar_1.3.0
[9] glue_1.3.0 withr_2.1.2 modelr_0.1.2 readxl_1.1.0
[13] bindr_0.1.1 munsell_0.5.0 gtable_0.2.0 cellranger_1.1.0
[17] rvest_0.3.2 caTools_1.17.1.1 broom_0.5.0 Rcpp_0.12.18
[21] KernSmooth_2.23-15 scales_1.0.0 backports_1.1.2 gdata_2.18.0
[25] jsonlite_1.5 hms_0.4.2 stringi_1.1.7 grid_3.4.4
[29] cli_1.0.0 tools_3.4.4 bitops_1.0-6 magrittr_1.5
[33] lazyeval_0.2.1 crayon_1.3.4 pkgconfig_2.0.2 xml2_1.2.0
[37] lubridate_1.7.4 assertthat_0.2.0 httr_1.3.1 rstudioapi_0.7
[41] R6_2.2.2 nlme_3.1-131.1 compiler_3.4.4
Max
November 7, 2018, 2:46pm
4
I think that the issue is this:
Xdat = spam.train,Ydat = spam.test
I would expect an argument like Xdat
to be a data set of predictors and Ydat
to be a vector of outcomes. The code
Xtr = Xdat[train,]
Ytr = Ydat[train]
Xvl = Xdat[!train,]
Yvl = Ydat[!train]
indicates that the training and test splits are in the loop but you've done that split in spam. train
and spam.test
.