while running lda algorithm,different output arises with same code and same dataset

library(ggplot2)
library(grid)
library(plyr)
library(reshape)
library(tm)
library(topicmodels)
library(lda)
dataValues<- read.csv('dataset.csv',sep=",")
dataValues=dataValues[sample(nrow(dataValues),size=10000,replace=FALSE),]
set.seed(2)
dim(dataValues)
CorpusObj<- VectorSource(dataValues$skills);
CorpusObj<-Corpus(CorpusObj);
CorpusObj <- tm_map(CorpusObj, tolower)
CorpusObj <- tm_map(CorpusObj, removePunctuation)
CorpusObj <- tm_map(CorpusObj, removeNumbers)
CorpusObj <- tm_map(CorpusObj, removeWords, stopwords("english"))
CorpusObj <- tm_map(CorpusObj, stemDocument, language = "english") 
CorpusObj<-tm_map(CorpusObj,stripWhitespace)
CorpusObj.tdm <- TermDocumentMatrix(CorpusObj, control = list(minWordLength = 3))
dim(dataValues)
inspect(CorpusObj.tdm[1:10,1:10])
findFreqTerms(CorpusObj.tdm, lowfreq=1003)
dim(CorpusObj.tdm)
CorpusObj.tdm.sp <- removeSparseTerms(CorpusObj.tdm, sparse=0.88)
dim(CorpusObj.tdm.sp)
inspect(CorpusObj.tdm.sp[1:10,1:9])
library(wordcloud)
library(RColorBrewer)
mTDM <- as.matrix(CorpusObj.tdm)
v <- sort(rowSums(mTDM),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:2)]
png("wordcloud.png", width=1280,height=800)
wordcloud(d$word,d$freq, scale=c(8,.3),min.freq=2,max.words=100, random.order=T, rot.per=.15, colors=pal, vfont=c("sans serif","plain"))
dev.off()
CorpusObj.tdm.reformatted <- as.matrix(CorpusObj.tdm.sp)
object.size(CorpusObj.tdm.sp)
object.size(CorpusObj.tdm.reformatted)
CorpusObj.tdm.reformatted = melt(CorpusObj.tdm.reformatted, value.name ="count")
head(CorpusObj.tdm.reformatted)
tail(CorpusObj.tdm.reformatted)
library(ggplot2)
head(CorpusObj.tdm.reformatted)
dim(CorpusObj.tdm.reformatted)
ggplot(CorpusObj.tdm.reformatted, aes(x = Docs, y = Terms, fill = log10( value) )) +
  geom_tile(colour = "white") +
  scale_fill_gradient(high="#FF0000" , low="#FFFFFF")+
  ylab("") +
  theme(panel.background = element_blank()) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
CorpusObj.tdm.sp <- removeSparseTerms(CorpusObj.tdm, sparse=0.5)
CorpusObj.tdm.sp.df <- as.data.frame(inspect(CorpusObj.tdm.sp ))
nrow(CorpusObj.tdm.sp.df)
library(slam)
CorpusObj.tdm.sp.t <- t(CorpusObj.tdm.sp)
summary(col_sums(CorpusObj.tdm.sp.t))
term_tfidf <- tapply(CorpusObj.tdm.sp.t$v/row_sums(CorpusObj.tdm.sp.t)[CorpusObj.tdm.sp.t$i], CorpusObj.tdm.sp.t$j,mean) * log2(nDocs(CorpusObj.tdm.sp.t)/col_sums(CorpusObj.tdm.sp.t>0))
summary(term_tfidf)
CorpusObj.tdm.sp.t.tdif <- CorpusObj.tdm.sp.t[,term_tfidf>=1.0]
CorpusObj.tdm.sp.t.tdif <- CorpusObj.tdm.sp.t[row_sums(CorpusObj.tdm.sp.t) > 0, ]
summary(col_sums(CorpusObj.tdm.sp.t.tdif))
library(topicmodels)
myModel=builtModel<-LDA(CorpusObj.tdm, 10)
head(topics(myModel))
best.model <- lapply(seq(2, 50, by = 1), function(d){LDA(CorpusObj.tdm.sp.t.tdif, d)})
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
terms(myModel, 20 )
corpusLDA <- lexicalize(CorpusObj )
require(lda)
set.seed(2)
ldaModel=lda.collapsed.gibbs.sampler(corpusLDA$documents,K=10,vocab=corpusLDA$vocab,burnin=9999,num.iterations=5000,alpha=1,eta=0.1)
top.words <- top.topic.words(ldaModel$topics, 20, by.score=TRUE)
print(top.words)

set.seed(1234)

at the beginning might do the job.
otherwise each calculation uses different random numbers
Peter

3 Likes

it worked..thanks lot

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.