Traditional text classification

Our first NLP model will use traditional NLP techniques, that is, not deep learning. For the rest of this chapter, when we use the term traditional NLP, we mean approaches that do not use deep learning. The most used method for NLP in traditional NLP classification uses a bag-of-words approach.

We will also use a set of hyperparameters and machine learning algorithms to maximize accuracy:

Feature generation: The features can be term frequency, tf-idf, or binary flags
Preprocessing: We preprocess text data by stemming the words
Remove stop-words: We treat the feature creation, stop-words, and stemming options as hyperparameters
Machine learning algorithm: The script applies three machine learning algorithms to the data (Naive Bayes, SVM, neural network, and random forest)

We train 48 machine learning algorithms on the data in total, and evaluate which model is best. The script for this code is in the Chapter7/classify_text.R folder. The code does not contain any deep learning models, so feel free to skip it if you want. First we load in the necessary libraries and create a function that creates a set of text classification models for a combination of hyperparameters on multiple machine learning algorithms:

library(tm)
require(nnet)
require(kernlab)
library(randomForest)
library(e1071)
options(digits=4)

TextClassification <-function (w,stem=0,stop=0,verbose=1)
{
  df <- read.csv("../data/reuters.train.tab", sep="\t", stringsAsFactors = FALSE)
  df2 <- read.csv("../data/reuters.test.tab", sep="\t", stringsAsFactors = FALSE)
  df <- rbind(df,df2)

  # df <- df[df$y %in% c(3,4),]
  # df$y <- df$y-3
  df[df$y!=3,]$y<-0
  df[df$y==3,]$y<-1
  rm(df2)

  corpus <- Corpus(DataframeSource(data.frame(df[, 2])))
  corpus <- tm_map(corpus, content_transformer(tolower))
  
  # hyperparameters
  if (stop==1)
    corpus <- tm_map(corpus, function(x) removeWords(x, stopwords("english")))
  if (stem==1)
    corpus <- tm_map(corpus, stemDocument)
  if (w=="tfidf")
    dtm <- DocumentTermMatrix(corpus,control=list(weighting=weightTfIdf))
  else if (w=="tf")
    dtm <- DocumentTermMatrix(corpus,control=list(weighting=weightTf))
  else if (w=="binary")
    dtm <- DocumentTermMatrix(corpus,control=list(weighting=weightBin))
  
  # keep terms that cover 95% of the data
  dtm2<-removeSparseTerms(dtm, 0.95)
  m <- as.matrix(dtm2)
  remove(dtm,dtm2,corpus)
  
  data<-data.frame(m)
  data<-cbind(df[, 1],data)
  colnames(data)[1]="y"
  
  # create train, test sets for machine learning
  seed <- 42 
  set.seed(seed) 
  nobs <- nrow(data)
  sample <- train <- sample(nrow(data), 0.8*nobs)
  validate <- NULL
  test <- setdiff(setdiff(seq_len(nrow(data)), train), validate)

Now that we have created a sparse data-frame, we will use 4 different machine learning algorithms on the data: Naive Bayes, SVM, a neural network model and a random forest model. We use 4 machine learning algorithms because, as you see below, the code to call a machine learning algorithm is small compared to the code needed to create the data in the previous section and the code needed to the the NLP. It is almost always a good idea to run multiple machine learning algorithms when possible because no machine learning algorithm is consistently the best.

  
  # create Naive Bayes model
  nb <- naiveBayes(as.factor(y) ~., data=data[sample,])
  pr <- predict(nb, newdata=data[test, ])
  # Generate the confusion matrix showing counts.
  tab<-table(na.omit(data[test, ])$y, pr,
             dnn=c("Actual", "Predicted"))
  if (verbose) print (tab)
  nb_acc <- 100*sum(diag(tab))/length(test)
  if (verbose) print(sprintf("Naive Bayes accuracy = %1.2f%%",nb_acc))
  
  # create SVM model
  if (verbose) print ("SVM")
  if (verbose) print (Sys.time())
  ksvm <- ksvm(as.factor(y) ~ .,
               data=data[sample,],
               kernel="rbfdot",
               prob.model=TRUE)
  if (verbose) print (Sys.time())
  pr <- predict(ksvm, newdata=na.omit(data[test, ]))
  # Generate the confusion matrix showing counts.
  tab<-table(na.omit(data[test, ])$y, pr,
             dnn=c("Actual", "Predicted"))
  if (verbose) print (tab)
  svm_acc <- 100*sum(diag(tab))/length(test)
  if (verbose) print(sprintf("SVM accuracy = %1.2f%%",svm_acc))
  
  # create Neural Network model
  rm(pr,tab)
  set.seed(199)
  if (verbose) print ("Neural Network")
  if (verbose) print (Sys.time())
  nnet <- nnet(as.factor(y) ~ .,
               data=data[sample,],
               size=10, skip=TRUE, MaxNWts=10000, trace=FALSE, maxit=100)
  if (verbose) print (Sys.time())
  pr <- predict(nnet, newdata=data[test, ], type="class")
  # Generate the confusion matrix showing counts.
  tab<-table(data[test, ]$y, pr,
             dnn=c("Actual", "Predicted"))
  if (verbose) print (tab)
  nn_acc <- 100*sum(diag(tab))/length(test)
  if (verbose) print(sprintf("Neural Network accuracy = %1.2f%%",nn_acc))
  
  # create Random Forest model
  rm(pr,tab)
  if (verbose) print ("Random Forest")
  if (verbose) print (Sys.time())
  rf_model<-randomForest(as.factor(y) ~., data=data[sample,])
  if (verbose) print (Sys.time())
  pr <- predict(rf_model, newdata=data[test, ], type="class")
  # Generate the confusion matrix showing counts.
  tab<-table(data[test, ]$y, pr,
             dnn=c("Actual", "Predicted"))
  if (verbose) print (tab)
  rf_acc <- 100*sum(diag(tab))/length(test)
  if (verbose) print(sprintf("Random Forest accuracy = %1.2f%%",rf_acc))
  
  dfParams <- data.frame(w,stem,stop)
  dfParams$nb_acc <- nb_acc
  dfParams$svm_acc <- svm_acc
  dfParams$nn_acc <- nn_acc
  dfParams$rf_acc <- rf_acc
  
  return(dfParams)
}

We now call the function with different hyperparameters in the following code:

dfResults <- TextClassification("tfidf",verbose=1) # tf-idf, no stemming
dfResults<-rbind(dfResults,TextClassification("tf",verbose=1)) # tf, no stemming
dfResults<-rbind(dfResults,TextClassification("binary",verbose=1)) # binary, no stemming

dfResults<-rbind(dfResults,TextClassification("tfidf",1,verbose=1)) # tf-idf, stemming
dfResults<-rbind(dfResults,TextClassification("tf",1,verbose=1)) # tf, stemming
dfResults<-rbind(dfResults,TextClassification("binary",1,verbose=1)) # binary, stemming

dfResults<-rbind(dfResults,TextClassification("tfidf",0,1,verbose=1)) # tf-idf, no stemming, remove stopwords
dfResults<-rbind(dfResults,TextClassification("tf",0,1,verbose=1)) # tf, no stemming, remove stopwords
dfResults<-rbind(dfResults,TextClassification("binary",0,1,verbose=1)) # binary, no stemming, remove stopwords

dfResults<-rbind(dfResults,TextClassification("tfidf",1,1,verbose=1)) # tf-idf, stemming, remove stopwords
dfResults<-rbind(dfResults,TextClassification("tf",1,1,verbose=1)) # tf, stemming, remove stopwords
dfResults<-rbind(dfResults,TextClassification("binary",1,1,verbose=1)) # binary, stemming, remove stopwords

dfResults[, "best_acc"] <- apply(dfResults[, c("nb_acc","svm_acc","nn_acc","rf_acc")], 1, max)
dfResults <- dfResults[order(-dfResults$best_acc),]
dfResults

strResult <- sprintf("Best accuracy score was %1.2f%%. Hyper-parameters: ",dfResults[1,"best_acc"])
strResult <- paste(strResult,dfResults[1,"w"],",",sep="")
strResult <- paste(strResult,
                   ifelse(dfResults[1,"stem"] == 0,"no stemming,","stemming,"))
strResult <- paste(strResult,
                   ifelse(dfResults[1,"stop"] == 0,"no stop word processing,","removed stop words,"))
if (dfResults[1,"best_acc"] == dfResults[1,"nb_acc"]){
  strResult <- paste(strResult,"Naive Bayes model")
} else if (dfResults[1,"best_acc"] == dfResults[1,"svm_acc"]){
  strResult <- paste(strResult,"SVM model")
} else if (dfResults[1,"best_acc"] == dfResults[1,"nn_acc"]){
  strResult <- paste(strResult,"Neural Network model")
}else if (dfResults[1,"best_acc"] == dfResults[1,"rf_acc"]){
  strResult <- paste(strResult,"Random Forest model")
}

print (strResult)

For each combination of hyperparameters, the script saves the best score from the four machine learning algorithms in the best_acc field. Once the training is complete, we can look at the results:

> dfResults
 w stem stop nb_acc svm_acc nn_acc rf_acc best_acc
12 binary    1    1   86.06   95.24   90.52   94.26     95.24
9  binary    0    1   87.71   95.15   90.52   93.72     95.15
10 tfidf     1    1   91.99   95.15   91.05   94.17     95.15
3  binary    0    0   85.98   95.01   90.29   93.99     95.01
6  binary    1    0   84.59   95.01   90.34   93.63     95.01
7  tfidf     0    1   91.27   94.43   94.79   93.54     94.79
11 tf        1    1   77.47   94.61   92.30   94.08     94.61
4  tfidf     1    0   92.25   94.57   90.96   93.99     94.57
5  tf        1    0   75.11   94.52   93.46   93.90     94.52
1  tfidf     0    0   91.54   94.26   91.59   93.23     94.26
2  tf        0    0   75.82   94.03   91.54   93.59     94.03
8  tf        0    1   78.14   94.03   91.63   93.68     94.03

> print (strResult)
[1] "Best accuracy score was 95.24%. Hyper-parameters: binary, stemming, removed stop words, SVM model"

The results are ordered by best results, so here we can can see that our best accuracy overall was 95.24%. The reason for training so many models is that there is no right formula for traditional NLP tasks that's work for most cases, so you should try multiple combinations of preprocessing and different algorithms, as we have done here. For example, if you searched for an example online on text classification, you could find an example that would suggest to use tf-idf and naive bayes. Here, we can see that it is one of the worst performers.