Load data

# load caret for preprocessing and model training
library(caret)
# load data
training <- read.csv("./data/pml-training.csv", row.names = 1)
testing <- read.csv("./data/pml-testing.csv", row.names = 1)
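# If the CSVs are not already under ./data/, they can be fetched first. The
# URLs below are the ones commonly used for this course dataset; treat them
# as an assumption and verify before relying on them:
# download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
#               "./data/pml-training.csv")
# download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
#               "./data/pml-testing.csv")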
# remove near-zero-variance covariates
nsv <- nearZeroVar(training, saveMetrics = TRUE)
training <- training[, !nsv$nzv]
# remove variables with more than 80% missing values
nav <- colMeans(is.na(training)) > 0.8
training <- training[, !nav]
# Spearman correlation of each predictor with the outcome classe
cor <- abs(sapply(colnames(training[, -ncol(training)]),
                  function(x) cor(as.numeric(training[, x]),
                                  as.numeric(training$classe),
                                  method = "spearman")))
# summarize the correlations
summary(cor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0015  0.0147  0.0524  0.0862  0.1370  0.3170
# plot the two predictors most correlated with classe
top2 <- names(sort(cor, decreasing = TRUE))[1:2]
plot(training[, top2[1]], training[, top2[2]], col = training$classe,
     pch = 19, cex = 0.1, xlab = top2[1], ylab = top2[2])

(Plot: the two predictors most correlated with classe, points colored by class.)

The training set has 19622 samples and 57 potential predictors after filtering.
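As a quick sanity check, the dimensions can be verified directly (57 predictors plus the classe outcome):

dim(training)  # expect 19622 rows and 58 columns (57 predictors + classe)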

None of the predictors correlates strongly with classe (the maximum Spearman correlation is only about 0.32), so a linear model is probably not suitable here. Boosting and random forest algorithms may generate more robust predictions for these data.
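For reference, the strongest (still weak) associations can be listed directly from the cor vector computed above; a minimal sketch:

# top predictors by absolute Spearman correlation with classe
head(sort(cor, decreasing = TRUE))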

Boosting model

set.seed(123)
boostFit <- train(classe ~ ., method = "gbm", data = training, verbose = FALSE,
                  trControl = trainControl(method = "cv", number = 10))
boostFit
## Stochastic Gradient Boosting 
## 
## 19622 samples
##    57 predictors
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 17660, 17660, 17659, 17660, 17658, 17660, ... 
## 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy  Kappa  Accuracy SD  Kappa SD
##   1                  50       0.8       0.8    0.01         0.01    
##   1                  100      0.9       0.9    0.006        0.008   
##   1                  150      0.9       0.9    0.005        0.006   
##   2                  50       1         0.9    0.004        0.005   
##   2                  100      1         1      0.002        0.003   
##   2                  150      1         1      0.002        0.003   
##   3                  50       1         1      0.003        0.004   
##   3                  100      1         1      0.002        0.003   
##   3                  150      1         1      0.001        0.002   
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 3 and shrinkage = 0.1.
plot(boostFit, ylim = c(0.9, 1))

(Plot: cross-validated accuracy of the boosting model by number of trees and interaction depth.)

The boosting algorithm generated a good model with a cross-validated accuracy of 0.997.
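A minimal sketch of how that number can be pulled from the caret fit (since accuracy was the selection metric, the selected model's accuracy is the largest value in the results table):

# cross-validated accuracy of the selected gbm model
max(boostFit$results$Accuracy)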

Random forests model

set.seed(123)
rfFit <- train(classe ~ ., method = "rf", data = training, importance = TRUE,
               trControl = trainControl(method = "cv", number = 10))
rfFit
## Random Forest 
## 
## 19622 samples
##    57 predictors
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 17660, 17661, 17659, 17660, 17658, 17660, ... 
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy  Kappa  Accuracy SD  Kappa SD
##   2     1         1      0.003        0.003   
##   40    1         1      5e-04        7e-04   
##   80    1         1      5e-04        6e-04   
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 40.
plot(rfFit, ylim = c(0.9, 1))

(Plot: cross-validated accuracy of the random forest model by mtry.)

The random forest algorithm generated a very accurate model, with cross-validated accuracy close to 1. Compared to the boosting model, the random forest generally performs better in terms of accuracy, as the plots show.
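Since both fits used 10-fold cross-validation with the same seed, caret's resamples() can put the comparison on a common footing; a minimal sketch:

# compare the two models on their cross-validation resamples
resamps <- resamples(list(gbm = boostFit, rf = rfFit))
summary(resamps)   # accuracy and kappa distributions per model
bwplot(resamps)    # side-by-side box plots of the resampled accuracy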

Final model and prediction

# final model
rfFit$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry, importance = ..1) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 40
## 
##         OOB estimate of  error rate: 0.04%
## Confusion matrix:
##      A    B    C    D    E class.error
## A 5580    0    0    0    0   0.0000000
## B    1 3796    0    0    0   0.0002634
## C    0    3 3419    0    0   0.0008767
## D    0    0    1 3214    1   0.0006219
## E    0    0    0    1 3606   0.0002772
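# Since importance = TRUE was set when fitting rfFit, caret's varImp() can
# rank the predictors; a quick sketch of inspecting the top ones:
imp <- varImp(rfFit)
plot(imp, top = 10)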
# prediction
(prediction <- as.character(predict(rfFit, testing)))
##  [1] "B" "A" "B" "A" "A" "E" "D" "B" "A" "A" "B" "C" "B" "A" "E" "E" "A"
## [18] "B" "B" "B"
# write prediction files (one per test case)
pml_write_files <- function(x) {
  # create the output directory if it does not exist yet
  if (!dir.exists("./prediction")) dir.create("./prediction")
  for (i in seq_along(x)) {
    filename <- paste0("./prediction/problem_id_", i, ".txt")
    write.table(x[i], file = filename, quote = FALSE, row.names = FALSE,
                col.names = FALSE)
  }
}
pml_write_files(prediction)
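To confirm that all 20 files were written:

list.files("./prediction")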