Caret Package
- Some pre-processing with preProcess()
- Data splitting with createDataPartition(), createResample(), and createTimeSlices()
- Training/test functions: train() and predict()
- Model comparison with confusionMatrix()
An example of data splitting:
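The examples below assume the tidyverse and caret are loaded:
library(tidyverse)
library(caret)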
# We create a random dataset
rand <- tibble(
  x = rnorm(100),
  y = rbinom(100, 1, .5)
)
# Hold out 25% of the rows for testing
in_train <- createDataPartition(
  y = rand$y,
  p = 0.75,
  list = FALSE
)
testing <- rand[-in_train,]
# y is numeric, so train() fits a regression (hence the RMSE below)
( ran_fit <- train(y ~ x, data = rand[in_train,], method = 'glm') )
## Generalized Linear Model
##
## 75 samples
## 1 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 75, 75, 75, 75, 75, 75, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.512954 0.02807049 0.5018367
Data Slicing
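A plausible reconstruction of the calls behind the output below, assuming kernlab's spam data (4,601 emails by 58 variables) and modelr's resampling helpers:
library(kernlab)
library(modelr)
data(spam)
# A single 75/25 train/test split
spam %>% resample_partition(c(test = 0.25, train = 0.75))
# Five cross-validation folds
spam %>% crossv_kfold(5)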
## $test
## <resample [1,150 x 58]> 5, 14, 20, 24, 29, 30, 36, 44, 47, 50, ...
##
## $train
## <resample [3,451 x 58]> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, ...
## # A tibble: 5 x 3
## train test .id
## <named list> <named list> <chr>
## 1 <resample> <resample> 1
## 2 <resample> <resample> 2
## 3 <resample> <resample> 3
## 4 <resample> <resample> 4
## 5 <resample> <resample> 5
To do prediction over time, use createTimeSlices():
time <- 1:100
createTimeSlices(time, initialWindow = 20, horizon = 10) ->
  time_slices
time_slices$train[1]
## $Training20
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
time_slices$test[1]
## $Testing20
## [1] 21 22 23 24 25 26 27 28 29 30
You see that the first training set is 1:20, then the first test set is 21:30. So if you’re looking to do prediction, you train on your training time window, then test your predictions against the next test time window.
Training Options
The train() function has a number of options:
- weights: allows you to upweight or downweight certain observations.
- metric: sets the summary metric: 'Accuracy' or 'Kappa' for factors, or 'RMSE' or \(R^2\) for continuous outcomes.
- trControl: takes the trainControl() function to set many more options.
Train Control
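The signature below looks like the output of args(trainControl):
args(trainControl)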
## function (method = "boot", number = ifelse(grepl("cv", method),
## 10, 25), repeats = ifelse(grepl("[d_]cv$", method), 1, NA),
## p = 0.75, search = "grid", initialWindow = NULL, horizon = 1,
## fixedWindow = TRUE, skip = 0, verboseIter = FALSE, returnData = TRUE,
## returnResamp = "final", savePredictions = FALSE, classProbs = FALSE,
## summaryFunction = defaultSummary, selectionFunction = "best",
## preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5,
## freqCut = 95/5, uniqueCut = 10, cutoff = 0.9), sampling = NULL,
## index = NULL, indexOut = NULL, indexFinal = NULL, timingSamps = 0,
## predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5,
## alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE,
## allowParallel = TRUE)
## NULL
- method: the method for resampling:
  - 'boot' for bootstrapping
  - 'boot632' for bootstrapping with adjustment
  - 'cv' for cross-validation
  - 'repeatedcv' for repeated cross-validation
  - 'LOOCV' for leave-one-out cross-validation
- number: the number of folds or number of resampling iterations.
- repeats: the number of times to repeat subsampling. Can slow things down if large.
- p: determines the size of the training set.
- initialWindow: gets passed to createTimeSlices() when the method is 'timeslice'.
- savePredictions: indicates how much of the hold-out predictions for each resample should be saved.
- summaryFunction: a function to compute performance metrics.
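For example, a sketch of switching the resampling method to 10-fold cross-validation, reusing the random data from the first example:
tc <- trainControl(method = 'cv', number = 10)
train(y ~ x, data = rand[in_train,], method = 'glm', trControl = tc)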
Plotting Predictors
Let's look at the example Wage data from the ISLR package.
library(ISLR)
# Build a test/training set
Wage %>%
resample_partition(c(train = .75, test = .25)) ->
wage_smpl
# Plot the points and add linear regressions
Wage %>%
ggplot(aes(age, wage, colour = education)) +
geom_point() +
geom_smooth(method = lm) +
labs(
title = 'Age vs Wage with Education Regression',
x = 'Age',
y = 'Wage',
colour = 'Education Level'
)
Cutting
You can use cut() to create categorical variables out of continuous ranges:
Wage %>%
mutate(wage_quartile = cut(wage, 4)) ->
wage_data
wage_data %>%
ggplot() +
geom_boxplot(aes(wage, age, fill = wage_quartile)) +
labs(
title = 'Distribution of Wage Versus Age and Wage Quartile',
x = 'Wage',
y = 'Age',
fill = 'Wage Quartile'
)
Tables
Contingency tables are useful for comparing factor variables:
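A reconstruction of the call that produces the counts below:
with(wage_data, table(wage_quartile, jobclass))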
## jobclass
## wage_quartile 1. Industrial 2. Information
## (19.8,94.6] 708 412
## (94.6,169] 783 894
## (169,244] 35 89
## (244,319] 18 61
Proportion tables are also useful. The argument '1' denotes taking the proportion over each row:
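A reconstruction of the corresponding call, using prop.table() with margin 1:
with(wage_data, table(wage_quartile, jobclass)) %>%
  prop.table(1)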
## jobclass
## wage_quartile 1. Industrial 2. Information
## (19.8,94.6] 0.6321429 0.3678571
## (94.6,169] 0.4669052 0.5330948
## (169,244] 0.2822581 0.7177419
## (244,319] 0.2278481 0.7721519
Density Plot
wage_data %>%
ggplot() +
geom_density(aes(wage, colour = education)) +
labs(
title = 'Density of Wage by Education',
x = 'Wage',
y = 'Density',
colour = 'Education Level'
)
Summary
- Only make your plots based on the training set.
- Look for
- Imbalance in outcomes/predictors
- Outliers
- Groups of points not explained by a predictor
- Skewed Variables
Preprocessing
Sometimes you may need to transform the predictors to make them more useful to certain models.
Standardising
Standardising is one way to transform: you subtract the mean and divide by the standard deviation. The transformed data then has a mean of 0 and a standard deviation of 1.
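The output below appears to come from standardising the vector 1:100 (whose mean is 50.5, matching the first line); a reconstruction, which also defines the example_set reused in the imputation example later:
example_set <- 1:100
mean(example_set)
sd(example_set)
standardised <- (example_set - mean(example_set)) / sd(example_set)
mean(standardised)
sd(standardised)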
## [1] 50.5
## [1] 29.01149
## [1] 0
## [1] 1
One thing to remember is that when you standardise the test set, you need to use the mean and standard deviation from the training set.
The preProcess() function in the caret package can help. The object returned from preProcess() on the training set can then be used, via predict(), to standardise the test set with the training set's statistics. The preProcess arguments can also be passed directly to the train() function (e.g. preProcess = c('center', 'scale')).
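A minimal sketch, where training and testing stand for hypothetical data frames:
pre_obj <- preProcess(training, method = c('center', 'scale'))
train_std <- predict(pre_obj, training)
# The test set is standardised with the *training* mean and sd
test_std <- predict(pre_obj, testing)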
Box-Cox Transforms
The Box-Cox transform is a way to transform non-normal dependent variables into a normal shape.
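With caret this is just another preProcess method; a sketch on made-up right-skewed data:
skewed <- tibble(x = rexp(100))  # Box-Cox requires positive values
preProcess(skewed, method = 'BoxCox')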
Imputing Data
Most prediction algorithms don’t handle missing data. You can impute the missing variables using a k-nearest neighbour algorithm.
set.seed(1)
# Knock out four of the first ten values
example_set[ sample(1:10, 4) ] <- NA
# Impute (and standardise) with k-nearest neighbours
tibble(x = example_set) %>%
  preProcess(method = 'knnImpute')
## Created from 96 samples and 1 variables
##
## Pre-processing:
## - centered (1)
## - ignored (0)
## - 5 nearest neighbor imputation (1)
## - scaled (1)
Covariate Creation
Covariates are sometimes called predictors, sometimes called features.
Two levels of covariate creation:
- From raw data to covariates.
- e.g. turning an email into properties like the average number of capital letters, etc.
- Transforming tidy covariates.
Level 1 - Raw to Covariates
This depends heavily on the application. The balancing act is summarisation versus information loss.
Examples:
- Text files: word frequency, phrase frequency (Google ngrams), capital letter frequency.
- Images: Edges, corners, blobs, ridges.
- Webpages: Number and type of images, position of elements, colours, videos.
- People: height, weight, sex, country of origin.
When in doubt, err on the side of more features. You can automate this, but proceed with caution because some features will be very useful in a training set, but won’t generalise well in a test set.
Level 2 - Tidy Covariates
This is more necessary for some methods (regression, support vector machines) than others (classification trees). It should only be done on the training set. The new covariates should be added to the data frames.
Dummy Variables
These are common variables to add:
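The indicator columns below come from something like caret's dummyVars() applied to the Wage data (a reconstruction):
dummyVars(wage ~ jobclass, data = Wage) %>%
  predict(newdata = Wage) %>%
  head()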
## jobclass.1. Industrial jobclass.2. Information
## 231655 1 0
## 86582 0 1
## 161300 1 0
## 155159 0 1
## 11443 0 1
## 376662 0 1
The first variable is an indicator that you're in the industrial sector, the second that you're in the information sector. The left-hand side of the formula is the response variable; in our instance this is wage.
Removing Zero Covariates
Some variables have essentially no variation. You can identify these with nearZeroVar():
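A reconstruction of the call that produces the metrics below:
nearZeroVar(Wage, saveMetrics = TRUE)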
## freqRatio percentUnique zeroVar nzv
## year 1.057732 0.23333333 FALSE FALSE
## age 1.153061 2.03333333 FALSE FALSE
## maritl 3.200617 0.16666667 FALSE FALSE
## race 8.464164 0.13333333 FALSE FALSE
## education 1.417518 0.16666667 FALSE FALSE
## region 0.000000 0.03333333 TRUE TRUE
## jobclass 1.060440 0.06666667 FALSE FALSE
## health 2.496503 0.06666667 FALSE FALSE
## health_ins 2.271538 0.06666667 FALSE FALSE
## logwage 1.052632 16.93333333 FALSE FALSE
## wage 1.052632 16.93333333 FALSE FALSE
- freqRatio is the ratio of frequencies of the most common value over the second most common value.
- percentUnique is the percentage of unique data points out of the total number of data points.
- zeroVar is a vector of logicals for whether the predictor has only one distinct value.
- nzv is a vector of logicals for whether the predictor is a near-zero variance predictor.
Spline Basis
Sometimes you want to use splines as basis functions so you get curved lines rather than straight lines:
library(splines)
wage_smpl$train %>%
  as_tibble() %>%
  ggplot(aes(age, wage)) +
  geom_point() +
  geom_smooth(method = lm, formula = y ~ bs(x, df = 3)) +
  labs(
    title = "Wage versus Age (Third Degree Spline)",
    x = 'Age',
    y = 'Wage'
  )
Principal Components Analysis
Often you have multiple quantitative variables that are highly correlated with each other. In this case it’s not useful to include every variable in the model. You might want to include some summary that captures most of the information in those quantitative variables.
mtcars %>%
  select_if(is.numeric) %>%
  cor() %>%
  abs() ->
  mtcars_cor
# Zero the diagonal so self-correlations are ignored
diag(mtcars_cor) <- 0
which(mtcars_cor > .9, arr.ind = T)
## row col
## disp 3 2
## cyl 2 3
So there's a high correlation between displacement and the number of cylinders.
mtcars %>%
ggplot(aes(cyl, disp)) +
geom_point() +
labs(
title = 'Cylinders versus Displacement',
x = 'Cylinders',
y = 'Displacement'
)
The basic idea of PCA is:
- We might not need every predictor.
- A weighted combination of predictors might be better.
- We should pick this combination to capture 'the most information possible'.
- Benefits:
- Reduced number of predictors
- Reduced noise due to averaging.
SVD & PCA
Singular Value Decomposition: if \(X\) is a matrix with each variable in a column and each observation in a row, then the SVD is a matrix decomposition:
\[ X = UDV^T \]
where the columns of \(U\) are orthogonal (left singular vectors), the columns of \(V\) are orthogonal (right singular vectors), and \(D\) is a diagonal matrix (singular values).
Principal Components Analysis: the principal components are equal to the right singular vectors if you first scale the variables.
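A quick way to check this claim in base R (a sketch on mtcars): the right singular vectors of the scaled matrix match the PCA rotation, up to sign.
scaled <- scale(as.matrix(mtcars))
# First two right singular vectors vs first two principal axes
svd(scaled)$v[, 1:2]
prcomp(scaled)$rotation[, 1:2]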
set.seed(493875)
tibble(
x = rnorm(20),
y = 3 * x + rnorm(20),
n = 1:20
) -> ex_data
ex_data %>%
ggplot(aes(x,y)) +
geom_point() +
geom_text(aes(label = n), vjust = 1.5) +
labs(
title = 'Example Data',
x = 'X',
y = 'Y'
)
ex_data %>%
select(-n) %>%
prcomp() ->
ex_pca
ex_data %>%
ggplot() +
geom_point(aes(ex_pca$x[,1], ex_pca$x[,2])) +
geom_text(aes(ex_pca$x[,1], ex_pca$x[,2], label = n), vjust = 1.5) +
labs(
title = 'Example PCA',
x = 'First Principal Component',
y = 'Second Principal Component'
)
This is useful for linear-type models, but it can make it harder to interpret predictors.
Watch out for outliers: transform first (log), and plot predictors to identify problems.
Predicting with Regression
The key ideas:
- Fit a simple regression model.
- Plug in new covariates and multiply by the coefficients.
- Useful when the linear model is (nearly) correct.
We’ll use data on “Old Faithful’s” eruptions.
set.seed(1)
faithful %>%
resample_partition(c(train = .8, test = .2)) ->
faithful_smpl
faithful_smpl %>%
extract2('train') %>%
as_tibble() %>%
ggplot() +
geom_point(aes(waiting, eruptions)) +
labs(
title = 'Old Faithful: Waiting Time vs Eruption Time',
x = 'Waiting Time',
y = 'Eruption Duration'
)
It looks like a linear relationship, so we fit a linear model:
faithful_smpl %>%
extract2('train') %>%
as_tibble() %>%
lm(eruptions ~ waiting, data = .) ->
faithful_lm
summary(faithful_lm)
##
## Call:
## lm(formula = eruptions ~ waiting, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28619 -0.39513 0.03994 0.35570 1.21452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.84302 0.17832 -10.34 <2e-16 ***
## waiting 0.07494 0.00250 29.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5054 on 215 degrees of freedom
## Multiple R-squared: 0.8069, Adjusted R-squared: 0.806
## F-statistic: 898.5 on 1 and 215 DF, p-value: < 2.2e-16
We can predict a new value based on waiting times of 30 and 60 minutes:
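A sketch of the prediction call (the new-data construction is illustrative, but the values match the fitted coefficients):
predict(faithful_lm, newdata = tibble(waiting = c(30, 60)))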
## 1 2
## 0.4052043 2.6534262
Let's look at the training and test set errors. Note that this computes the root of the *summed* squared errors rather than the mean, which is why the larger training set yields the bigger number:
# Training error
faithful_smpl %>%
  extract2('train') %>%
  as_tibble() %>%
  mutate(pred = predict(faithful_lm, newdata = .)) %>%
  summarise(RMSE = sqrt(sum((eruptions - pred)^2)))
## # A tibble: 1 x 1
## RMSE
## <dbl>
## 1 7.41
# Test error
faithful_smpl %>%
  extract2('test') %>%
  as_tibble() %>%
  mutate(pred = predict(faithful_lm, newdata = .)) %>%
  summarise(RMSE = sqrt(sum((eruptions - pred)^2)))
## # A tibble: 1 x 1
## RMSE
## <dbl>
## 1 3.43
Regression with Multiple Covariates
One useful diagnostic plot is the residuals against the row order, to check whether some other variable is missing from the model. Any pattern would be related to time or age or some other continuous variable that the rows are ordered by.
wage_smpl %>%
extract2('train') %>%
lm(wage ~ age + jobclass + education, data = .) ->
wage_lm
wage_lm %>%
augment() %>%
mutate(index = row_number()) %>%
ggplot() +
geom_point(aes(index, .resid))
Quiz
Question 1
Load the Alzheimer’s disease data using the commands:
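Presumably the same loading commands as in Questions 4 and 5 below:
library(caret)
library(AppliedPredictiveModeling)
data(AlzheimerDisease)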
Which of the following commands will create non-overlapping training and test sets with about 50% of the observations assigned to each?
adData = data.frame(diagnosis,predictors)
trainIndex = createDataPartition(diagnosis,p=0.5,list=FALSE)
training = adData[-trainIndex,]
testing = adData[-trainIndex,]

adData = data.frame(diagnosis,predictors)
trainIndex = createDataPartition(diagnosis,p=0.5,list=FALSE)
training = adData[trainIndex,]
testing = adData[trainIndex,]

adData = data.frame(diagnosis,predictors)
testIndex = createDataPartition(diagnosis, p = 0.50,list=FALSE)
training = adData[-testIndex,]
testing = adData[testIndex,]

adData = data.frame(diagnosis,predictors)
train = createDataPartition(diagnosis, p = 0.50,list=FALSE)
test = createDataPartition(diagnosis, p = 0.50,list=FALSE)
Answer: The answer is 3: a single partition index is created, and the complementary indices go to the training and test sets, so the two sets don't overlap.
Question 2
Load the cement data using the commands:
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]
Make a plot of the outcome (CompressiveStrength) versus the index of the samples. Color by each of the variables in the data set (you may find the cut2() function in the Hmisc package useful for turning continuous covariates into factors). What do you notice in these plots?
training %>%
mutate(n = row_number()) %>%
ggplot() +
geom_point(aes(n, CompressiveStrength, colour = Age))
training %>%
mutate(n = row_number()) %>%
ggplot() +
geom_point(aes(n, CompressiveStrength, colour = FlyAsh))
- There is a non-random pattern in the plot of the outcome versus index that does not appear to be perfectly explained by any predictor suggesting a variable may be missing.
- There is a non-random pattern in the plot of the outcome versus index that is perfectly explained by the Age variable.
- There is a non-random pattern in the plot of the outcome versus index.
- There is a non-random pattern in the plot of the outcome versus index that is perfectly explained by the FlyAsh variable.
Question 3
Load the cement data using the commands:
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]
Make a histogram and confirm the SuperPlasticizer variable is skewed. Normally you might use the log transform to try to make the data more symmetric. Why would that be a poor choice for this variable?
- The log transform does not reduce the skewness of the non-zero values of SuperPlasticizer
- There are a large number of values that are the same and even if you took the log(SuperPlasticizer + 1) they would still all be identical so the distribution would not be symmetric.
- The SuperPlasticizer data include negative values so the log transform can not be performed.
- The log transform is not a monotone transformation of the data.
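A sketch of the histograms behind the two messages below (assuming the column is named Superplasticizer):
training %>%
  ggplot() +
  geom_histogram(aes(Superplasticizer))
training %>%
  ggplot() +
  geom_histogram(aes(log(Superplasticizer + 1)))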
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Question 4
Load the Alzheimer’s disease data using the commands:
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]
Find all the predictor variables in the training set that begin with IL. Perform principal components on these variables with the preProcess() function from the caret package. Calculate the number of principal components needed to capture 90% of the variance. How many are there?
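A sketch of the computation, using caret's preProcess with PCA and a 90% variance threshold:
training %>%
  select_at(vars(starts_with('IL'))) %>%
  preProcess(method = 'pca', thresh = 0.9)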
## Created from 251 samples and 12 variables
##
## Pre-processing:
## - centered (12)
## - ignored (0)
## - principal component signal extraction (12)
## - scaled (12)
##
## PCA needed 9 components to capture 90 percent of the variance
- 8
- 10
- 9
- 7
Question 5
Load the Alzheimer’s disease data using the commands:
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]
Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis. Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method=“glm” in the train function.
What is the accuracy of each method in the test set? Which is more accurate?
- Non-PCA Accuracy: 0.72 / PCA Accuracy: 0.65
- Non-PCA Accuracy: 0.65 / PCA Accuracy: 0.72
- Non-PCA Accuracy: 0.74 / PCA Accuracy: 0.74
- Non-PCA Accuracy: 0.72 / PCA Accuracy: 0.71
training %>%
select_at(vars('diagnosis', starts_with('IL'))) %>%
train(diagnosis ~ ., method = 'glm', data = .) %>%
predict(newdata = testing) %>%
confusionMatrix(testing$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 4 2
## Control 18 58
##
## Accuracy : 0.7561
## 95% CI : (0.6488, 0.8442)
## No Information Rate : 0.7317
## P-Value [Acc > NIR] : 0.3606488
##
## Kappa : 0.1929
##
## Mcnemar's Test P-Value : 0.0007962
##
## Sensitivity : 0.18182
## Specificity : 0.96667
## Pos Pred Value : 0.66667
## Neg Pred Value : 0.76316
## Prevalence : 0.26829
## Detection Rate : 0.04878
## Detection Prevalence : 0.07317
## Balanced Accuracy : 0.57424
##
## 'Positive' Class : Impaired
##
tc <- trainControl(preProcOptions = list(thresh = .8))
training %>%
select_at(vars('diagnosis', starts_with('IL'))) %>%
train(diagnosis ~ ., method = 'glm', preProcess = 'pca', trControl = tc, data = .) %>%
predict(newdata = testing) %>%
confusionMatrix(testing$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 1 2
## Control 21 58
##
## Accuracy : 0.7195
## 95% CI : (0.6094, 0.8132)
## No Information Rate : 0.7317
## P-Value [Acc > NIR] : 0.6517805
##
## Kappa : 0.0167
##
## Mcnemar's Test P-Value : 0.0001746
##
## Sensitivity : 0.04545
## Specificity : 0.96667
## Pos Pred Value : 0.33333
## Neg Pred Value : 0.73418
## Prevalence : 0.26829
## Detection Rate : 0.01220
## Detection Prevalence : 0.03659
## Balanced Accuracy : 0.50606
##
## 'Positive' Class : Impaired
##