Percentage prediction

I have this code working, and it saves the prediction as "H" and "NH". I would like to know: how can I save the prediction as a percentage instead of "H" and "NH"?

Regards!

Code:

options(java.parameters = "-Xmx15g")
setwd("C:/hom")
xl_data_tmp = read.csv("train.csv", header = TRUE, sep=",", dec=",")

xl_data_tmp$y <- as.factor(xl_data_tmp$y)
xl_data_tmp$x3 <- as.factor(xl_data_tmp$x3)


#normalize the numeric indices to the 0-1 range where necessary
#(x9 and x13 are left unnormalized)
xl_data_tmp$x1 = (xl_data_tmp$x1-min(xl_data_tmp$x1))/(max(xl_data_tmp$x1)-min(xl_data_tmp$x1))
xl_data_tmp$x2 = (xl_data_tmp$x2-min(xl_data_tmp$x2))/(max(xl_data_tmp$x2)-min(xl_data_tmp$x2))
xl_data_tmp$x4 = (xl_data_tmp$x4-min(xl_data_tmp$x4))/(max(xl_data_tmp$x4)-min(xl_data_tmp$x4))
xl_data_tmp$x5 = (xl_data_tmp$x5-min(xl_data_tmp$x5))/(max(xl_data_tmp$x5)-min(xl_data_tmp$x5))
xl_data_tmp$x6 = (xl_data_tmp$x6-min(xl_data_tmp$x6))/(max(xl_data_tmp$x6)-min(xl_data_tmp$x6))
xl_data_tmp$x7 = (xl_data_tmp$x7-min(xl_data_tmp$x7))/(max(xl_data_tmp$x7)-min(xl_data_tmp$x7))
xl_data_tmp$x8 = (xl_data_tmp$x8-min(xl_data_tmp$x8))/(max(xl_data_tmp$x8)-min(xl_data_tmp$x8))
xl_data_tmp$x10 = (xl_data_tmp$x10-min(xl_data_tmp$x10))/(max(xl_data_tmp$x10)-min(xl_data_tmp$x10))
xl_data_tmp$x11 = (xl_data_tmp$x11-min(xl_data_tmp$x11))/(max(xl_data_tmp$x11)-min(xl_data_tmp$x11))
xl_data_tmp$x12 = (xl_data_tmp$x12-min(xl_data_tmp$x12))/(max(xl_data_tmp$x12)-min(xl_data_tmp$x12))




summary(xl_data_tmp) #show a summary of the training data


set.seed(1)
sampidx <- c(sample(1:650,519), sample(651:1299,519))
print(sampidx)
#split once on the full data; re-indexing an already subsetted frame would produce NA rows
train_subset <- xl_data_tmp[sampidx,]
test_subset <- xl_data_tmp[-sampidx,]
print(train_subset)
print(test_subset)
summary(test_subset)


library(pROC)
##libraries needed for training
library(nnet)
library(caret)
##parallel processing setup
#install.packages("doParallel")
library(doParallel)
numCores <- detectCores() #number of cores
cl = makeCluster(numCores)
registerDoParallel(cl)
##parameters

nn.Grid <- expand.grid(.size=c(1,2,3), .decay=c(0.01,0.1,1))
#create a list of seeds so the nnet training iterations are reproducible
set.seed(1)
nn.seeds <- vector(mode = "list", length = 11) # number of resamples (5 folds x 2 repeats = 10) + 1 for the final model
for(i in 1:10) nn.seeds[[i]] <- sample.int(n=1000, 9) # 9 tuning parameter combinations (3 sizes x 3 decay values)
nn.seeds[[11]] <- 1 # for the last model
remove(i)
nn.seeds
#configuration of the nnet training cycles and of the returned output
nn.Control <- trainControl(method = "repeatedcv", # repeated N-fold cross-validation
                           number = 5, # the number of folds
                           repeats = 2,
                           classProbs = TRUE, summaryFunction = twoClassSummary,
                           seeds = nn.seeds)
#Fit model
model.nn <- train(y ~ .,
                  data=train_subset,
                  method='nnet',
                  maxit = 500,
                  linout = FALSE,
                  trControl = nn.Control,
                  tuneGrid = nn.Grid,
                  metric = "ROC",
                  MaxNWts = 1000000,
                  importance=TRUE,
                  na.action=na.exclude,
                  allowParallel = TRUE)

stopCluster(cl)
remove(cl)
registerDoSEQ()

varImp(model.nn) #importance of each variable in the trained model
print(model.nn)

plot(model.nn, metric = "ROC") #show ROC as a function of decay and size
remove(nn.Control, nn.Grid, nn.seeds)


require("NeuralNetTools")
garson(model.nn)
#plot_data<-garson(model.nn, bar_plot = FALSE)$rel_imp
#plot_data_names<-garson(model.nn)$x_names
#print(plot_data)
#print(plot_data_names)
olden(model.nn)
plot_data2<-garson(model.nn, bar_plot = FALSE)
plot_data3<-olden(model.nn, bar_plot = FALSE)
print(plot_data2)
plot_data3
#escribir el resultado a un archivo
library("xlsx")
write.xlsx(plot_data2, file = "garson.xlsx", sheetName = "resultado", append = FALSE)
write.xlsx(plot_data3, file = "olden.xlsx", sheetName = "resultado", append = FALSE)

##predict using external data
#read the data to predict on from file
xl_data_test = read.csv("predecir.csv", header = TRUE, sep=",", dec=",")
#convert categorical variables
xl_data_test$y  <- as.factor(xl_data_test$y)
xl_data_test$x3 <- as.factor(xl_data_test$x3)


#normalize the numeric values (x9 and x13 are left unnormalized)
xl_data_test$x1 = (xl_data_test$x1-min(xl_data_test$x1))/(max(xl_data_test$x1)-min(xl_data_test$x1))
xl_data_test$x2 = (xl_data_test$x2-min(xl_data_test$x2))/(max(xl_data_test$x2)-min(xl_data_test$x2))
xl_data_test$x4 = (xl_data_test$x4-min(xl_data_test$x4))/(max(xl_data_test$x4)-min(xl_data_test$x4))
xl_data_test$x5 = (xl_data_test$x5-min(xl_data_test$x5))/(max(xl_data_test$x5)-min(xl_data_test$x5))
xl_data_test$x6 = (xl_data_test$x6-min(xl_data_test$x6))/(max(xl_data_test$x6)-min(xl_data_test$x6))
xl_data_test$x7 = (xl_data_test$x7-min(xl_data_test$x7))/(max(xl_data_test$x7)-min(xl_data_test$x7))
xl_data_test$x8 = (xl_data_test$x8-min(xl_data_test$x8))/(max(xl_data_test$x8)-min(xl_data_test$x8))
xl_data_test$x10 = (xl_data_test$x10-min(xl_data_test$x10))/(max(xl_data_test$x10)-min(xl_data_test$x10))
xl_data_test$x11 = (xl_data_test$x11-min(xl_data_test$x11))/(max(xl_data_test$x11)-min(xl_data_test$x11))
xl_data_test$x12 = (xl_data_test$x12-min(xl_data_test$x12))/(max(xl_data_test$x12)-min(xl_data_test$x12))


xl_data_test1 = xl_data_test
#predict
preds.nn <- predict.train(model.nn, newdata=xl_data_test1, type="raw") # neural network
preds.nn

##write the prediction results to an Excel file
write.xlsx(cbind(preds.nn,xl_data_test1), file = "salida_prediccion.xlsx", sheetName = "nnet", append = FALSE)

Hello,

In order for us to help you, we'll need a minimal reproducible example where we can see the data, or at least some dummy data in which the H and NH values are present.

Please look at this post for creating such an example:

Also, I see that you are repeating a lot of code to normalize the columns of a data frame. Here is a way to write it more efficiently:

library("dplyr")

#Create fake data
myData = data.frame(x = sample(1:50, 10),
                    y = sample(letters, 10),
                    z = sample(1:50, 10))

#Create normalize function
normalize <- function(data){
  (data - min(data))/(max(data) - min(data))
}

#Apply normalize function to all columns of interest at once
#Tidyverse implementation ...
myData = myData %>% mutate_at(c("x", "z"), normalize)

# ... or standard R implementation
myData[,c("x", "z")] = apply(myData[,c("x", "z")], 2, normalize)

Good luck,
PJ


Hi, thank you very much for responding. I am new here and had not realized that you can attach files. I am attaching the training file "train" and the file I want to predict on, "predecir".
Thank you!

https://mega.nz/#!AtglyISb!q7NoCQ_VRqKJVWA-NsmFpYTo9f1eq4z-dd7MO8cNZCs

Hi,

I cleaned up your code a bit and found the solution to your question:

#set the Java heap size before xlsx (rJava) starts the JVM
options(java.parameters = "-Xmx15g")

library(dplyr)
library(pROC)
library(xlsx)
##libraries needed for training
library(nnet)
library(caret)
library(NeuralNetTools)
##parallel processing setup
library(doParallel)

xl_data_tmp = read.csv("train.csv", header = TRUE, sep=",", dec=",")

xl_data_tmp$y <- as.factor(xl_data_tmp$y)
xl_data_tmp$x3 <- as.factor(xl_data_tmp$x3)

#Create normalize function
normalize <- function(data){
  (data - min(data))/(max(data) - min(data))
}

#normalize the numeric indices to the 0-1 range where necessary
xl_data_tmp = xl_data_tmp %>% mutate_at(c("x1", "x2", "x4", "x5", "x6", "x7", "x8", "x10", "x11", "x12"), normalize)

summary(xl_data_tmp) #show a summary of the training data


set.seed(1)
sampidx <- c(sample(1:650,519), sample(651:1299,519))
print(sampidx)

#split once on the full data; re-indexing an already subsetted frame would produce NA rows
train_subset <- xl_data_tmp[sampidx,]
test_subset <- xl_data_tmp[-sampidx,]

numCores <- detectCores() #number of cores
cl = makeCluster(numCores)
registerDoParallel(cl)
##parameters

nn.Grid <- expand.grid(.size=c(1,2,3), .decay=c(0.01,0.1,1))
#create a list of seeds so the nnet training iterations are reproducible
set.seed(1)
nn.seeds <- vector(mode = "list", length = 11) # number of resamples (5 folds x 2 repeats = 10) + 1 for the final model
for(i in 1:10) nn.seeds[[i]] <- sample.int(n=1000, 9) # 9 tuning parameter combinations (3 sizes x 3 decay values)
nn.seeds[[11]] <- 1 # for the last model
remove(i)
nn.seeds
#configuration of the nnet training cycles and of the returned output
nn.Control <- trainControl(method = "repeatedcv", # repeated N-fold cross-validation
                           number = 5, # the number of folds
                           repeats = 2,
                           classProbs = TRUE, summaryFunction = twoClassSummary,
                           seeds = nn.seeds)
#Fit model
model.nn <- train(y ~ .,
                  data=train_subset,
                  method='nnet',
                  maxit = 500,
                  linout = FALSE,
                  trControl = nn.Control,
                  tuneGrid = nn.Grid,
                  metric = "ROC",
                  MaxNWts = 1000000,
                  importance=TRUE,
                  na.action=na.exclude,
                  allowParallel = TRUE)

stopCluster(cl)
remove(cl)
registerDoSEQ()

varImp(model.nn) #importance of each variable in the trained model
print(model.nn)

plot(model.nn, metric = "ROC") #show ROC as a function of decay and size
remove(nn.Control, nn.Grid, nn.seeds)



garson(model.nn)
olden(model.nn)
plot_data2<-garson(model.nn, bar_plot = FALSE)
plot_data3<-olden(model.nn, bar_plot = FALSE)
print(plot_data2)
plot_data3

#write the results to a file

write.xlsx(plot_data2, file = "garson.xlsx", sheetName = "resultado", append = FALSE)
write.xlsx(plot_data3, file = "olden.xlsx", sheetName = "resultado", append = FALSE)

##predict using external data
#read the data to predict on from file
xl_data_test = read.csv("predecir.csv", header = TRUE, sep=",", dec=",")
#convert categorical variables
xl_data_test$y  <- as.factor(xl_data_test$y)
xl_data_test$x3 <- as.factor(xl_data_test$x3)

#normalize the numeric values
xl_data_test = xl_data_test %>% mutate_at(c("x1", "x2", "x4", "x5", "x6", "x7", "x8", "x10", "x11", "x12"), normalize)
xl_data_test1 = xl_data_test

#predict
preds.nn <- predict.train(model.nn, newdata=xl_data_test1, type="prob") # neural network
percentages = preds.nn %>% transmute(Hpercent = round(H*100, 2), NHpercent = round(NH*100, 2))
head(percentages)

##write the prediction results to an Excel file
write.xlsx(cbind(preds.nn,xl_data_test1), file = "salida_prediccion.xlsx", sheetName = "nnet", append = FALSE)

Result:

> head(percentages)
  Hpercent NHpercent
1    98.99      1.01
2     7.88     92.12
3     9.06     90.94
4     2.26     97.74
5    99.28      0.72
6    90.04      9.96

All you had to do was change type="raw" to type="prob" in the predict.train function. You then get two columns with the probability of each class (they sum to 1). I rounded the values and converted them to percentages for better readability.
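
If you also want to keep the predicted H/NH label next to the percentages, here is a minimal sketch reusing the objects from the script above (model.nn, xl_data_test1 and percentages):

#Get the class labels as well and show them next to the percentages
preds.class <- predict.train(model.nn, newdata = xl_data_test1, type = "raw")
head(cbind(class = preds.class, percentages))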

Hope this is what you wanted
PJ

