Random Forest

Library

### Loading the data

# Load the prepared datasets saved earlier in the pipeline (paths resolved
# from the project root via here()).
# NOTE(review): analysis_train and analysis_folds are never referenced again
# in this script — a fresh split and fresh folds are created below from
# analysis_data. Confirm these two loads are still needed.
analysis_train <- readRDS(here("models", "data", "analysis_train.rds"))
analysis_data <- readRDS(here("models", "data", "analysis_data.rds"))
analysis_folds <- readRDS(here("models", "data", "analysis_folds.rds"))

### Split the data

# Reproducible train/test split (~75/25 per the printed sizes below),
# stratified on the outcome so the rare "carried a weapon" class (~4%)
# keeps the same prevalence in both partitions.
set.seed(2023)

Weapon_split <- initial_split(analysis_data, 
                               strata = WeaponCarryingSchool)

weapon_train <- training(Weapon_split)
weapon_test <- testing(Weapon_split)

# Print the split sizes (<Training/Testing/Total>).
Weapon_split
<Training/Testing/Total>
<14696/4899/19595>

### Let's check our work

# Sanity-check the stratification: class balance in the TRAINING set
# (expect ~96% "0" / ~4% "1", matching the test set below).
weapon_train |> 
  tabyl(WeaponCarryingSchool)  |> 
  adorn_pct_formatting(0) |> 
  adorn_totals()
 WeaponCarryingSchool     n percent
                    0 14042     96%
                    1   654      4%
                Total 14696       -
# Class balance in the held-out TEST set — proportions match the training
# set, confirming the stratified split worked as intended.
weapon_test |>  
  tabyl(WeaponCarryingSchool)  |> 
  adorn_pct_formatting(0) |> 
  adorn_totals()
 WeaponCarryingSchool    n percent
                    0 4705     96%
                    1  194      4%
                Total 4899       -
# 5-fold cross-validation on the training set for hyperparameter tuning,
# again stratified on the outcome so each fold keeps the ~4% positive rate.
set.seed(2023)

cv_weapon <- rsample::vfold_cv(weapon_train, 
                                v= 5,
                                strata = WeaponCarryingSchool)
# Print the fold structure (5 splits of ~11,757 analysis / ~2,939 assessment).
cv_weapon
#  5-fold cross-validation using stratification 
# A tibble: 5 × 2
  splits               id   
  <list>               <chr>
1 <split [11756/2940]> Fold1
2 <split [11757/2939]> Fold2
3 <split [11757/2939]> Fold3
4 <split [11757/2939]> Fold4
5 <split [11757/2939]> Fold5
# Preprocessing recipe, using all remaining columns as predictors:
#   1. impute missing nominal predictors with the mode
#   2. impute missing numeric predictors with the mean
#   3. dummy-encode the nominal predictors
# Imputation steps run before step_dummy so the dummies are built from
# complete columns.
weapon_recipe <- 
  recipe(formula = WeaponCarryingSchool ~ ., data = weapon_train) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_impute_mean(all_numeric_predictors()) |> 
  step_dummy(all_nominal_predictors())
# Random-forest specification: a fixed forest of 100 trees with mtry and
# min_n left as tuning parameters, to be resolved over the CV folds below.
weapon_spec <- 
  rand_forest(
    # the number of predictors to sample at each split
    mtry = tune(), 
    # the number of observations needed to keep splitting nodes
    min_n = tune(),
    trees = 100) |>  
  set_mode("classification") |>  
  set_engine("ranger", 
             # Permutation importance is required for variable-importance
             # plots with vip()
             importance = "permutation") 

weapon_spec
Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = 100
  min_n = tune()

Engine-Specific Arguments:
  importance = permutation

Computational engine: ranger 
# Bundle the recipe and the model spec into a single workflow so tuning and
# final fitting apply the same preprocessing consistently.
weapon_workflow <- 
  workflow() |> 
  add_recipe(weapon_recipe) |>  
  add_model(weapon_spec) 

weapon_workflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
3 Recipe Steps

• step_impute_mode()
• step_impute_mean()
• step_dummy()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = 100
  min_n = tune()

Engine-Specific Arguments:
  importance = permutation

Computational engine: ranger 
# Register a parallel backend for tuning.
# NOTE(review): this foreach-based backend triggers the soft-deprecation
# warning captured below (tune >= 1.2.1); the supported route is now
# future::plan(multisession) — see ?tune::parallelism. Confirm before
# re-running.
doParallel::registerDoParallel()
  
set.seed(46257)
  
# Tune mtry and min_n over the 5 CV folds.
weapon_tune <-
  tune_grid(
    weapon_workflow,
    resamples = cv_weapon,
    # grid = 11 says to choose 11 parameter sets automatically 
    grid = 11)
i Creating pre-processing data to finalize unknown parameter: mtry
Warning: ! tune detected a parallel backend registered with foreach but no backend
  registered with future.
ℹ Support for parallel processing with foreach was soft-deprecated in tune
  1.2.1.
ℹ See ?parallelism (`?tune::parallelism()`) to learn more.
# Shut down the implicit parallel cluster now that tuning is finished.
doParallel::stopImplicitCluster() 
weapon_tune
# Tuning results
# 5-fold cross-validation using stratification 
# A tibble: 5 × 4
  splits               id    .metrics          .notes          
  <list>               <chr> <list>            <list>          
1 <split [11756/2940]> Fold1 <tibble [33 × 6]> <tibble [0 × 3]>
2 <split [11757/2939]> Fold2 <tibble [33 × 6]> <tibble [0 × 3]>
3 <split [11757/2939]> Fold3 <tibble [33 × 6]> <tibble [0 × 3]>
4 <split [11757/2939]> Fold4 <tibble [33 × 6]> <tibble [0 × 3]>
5 <split [11757/2939]> Fold5 <tibble [33 × 6]> <tibble [0 × 3]>
# Per-candidate metrics (accuracy, brier_class, roc_auc) averaged across
# the 5 folds — 11 candidates x 3 metrics = 33 rows.
collect_metrics(weapon_tune)
# A tibble: 33 × 8
    mtry min_n .metric     .estimator   mean     n std_err .config              
   <int> <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>                
 1     1    17 accuracy    binary     0.955      5 0.00160 Preprocessor1_Model01
 2     1    17 brier_class binary     0.0417     5 0.00137 Preprocessor1_Model01
 3     1    17 roc_auc     binary     0.673      5 0.00677 Preprocessor1_Model01
 4     1    32 accuracy    binary     0.955      5 0.00160 Preprocessor1_Model02
 5     1    32 brier_class binary     0.0417     5 0.00137 Preprocessor1_Model02
 6     1    32 roc_auc     binary     0.673      5 0.00931 Preprocessor1_Model02
 7     2     5 accuracy    binary     0.955      5 0.00160 Preprocessor1_Model03
 8     2     5 brier_class binary     0.0417     5 0.00128 Preprocessor1_Model03
 9     2     5 roc_auc     binary     0.672      5 0.00663 Preprocessor1_Model03
10     3    21 accuracy    binary     0.955      5 0.00160 Preprocessor1_Model04
# ℹ 23 more rows
# Visualize metric profiles across the tried mtry/min_n values.
autoplot(weapon_tune) 

# Select the hyperparameter set with the best cross-validated ROC AUC.
best_weapon <- select_best(weapon_tune, metric = "roc_auc")
best_weapon
# A tibble: 1 × 3
   mtry min_n .config              
  <int> <int> <chr>                
1     1    32 Preprocessor1_Model02
# Plug the winning mtry/min_n back into the workflow, replacing tune().
weapon_final_wf <- finalize_workflow(weapon_workflow, best_weapon)
weapon_final_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
3 Recipe Steps

• step_impute_mode()
• step_impute_mean()
• step_dummy()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)

Main Arguments:
  mtry = 1
  trees = 100
  min_n = 32

Engine-Specific Arguments:
  importance = permutation

Computational engine: ranger 
# Fit the finalized workflow on the full training set.
weapon_fit <- fit(weapon_final_wf, weapon_train)
weapon_fit
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
3 Recipe Steps

• step_impute_mode()
• step_impute_mean()
• step_dummy()

── Model ───────────────────────────────────────────────────────────────────────
Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~1L,      x), num.trees = ~100, min.node.size = min_rows(~32L, x),      importance = ~"permutation", num.threads = 1, verbose = FALSE,      seed = sample.int(10^5, 1), probability = TRUE) 

Type:                             Probability estimation 
Number of trees:                  100 
Sample size:                      14696 
Number of independent variables:  10 
Mtry:                             1 
Target node size:                 32 
Variable importance mode:         permutation 
Splitrule:                        gini 
OOB prediction error (Brier s.):  0.04166282 
# Evaluate the final model on the held-out TEST set.
# FIX(review): this previously augmented weapon_train — the same data the
# forest was fit on — so the ROC curve and AUC computed from weapon_pred
# were optimistically biased, and the test split created above was never
# used for evaluation. Scoring weapon_test gives an honest estimate.
weapon_pred <- 
  augment(weapon_fit, weapon_test) |> 
  select(WeaponCarryingSchool, .pred_class, .pred_1, .pred_0)

weapon_pred
# A tibble: 14,696 × 4
   WeaponCarryingSchool .pred_class .pred_1 .pred_0
   <fct>                <fct>         <dbl>   <dbl>
 1 0                    0            0.0489   0.951
 2 0                    0            0.0430   0.957
 3 0                    0            0.0351   0.965
 4 0                    0            0.0412   0.959
 5 0                    0            0.0327   0.967
 6 0                    0            0.0880   0.912
 7 0                    0            0.0489   0.951
 8 0                    0            0.0324   0.968
 9 0                    0            0.0488   0.951
10 0                    0            0.0324   0.968
# ℹ 14,686 more rows
# ROC curve for the positive class (.pred_1). The outcome factor's first
# level is "0" (see the tabyl output above), so event_level = "second" is
# needed to treat "1" (carried a weapon) as the event of interest.
weapon_roc_plot <- 
  weapon_pred |> 
  roc_curve(truth = WeaponCarryingSchool, 
           .pred_1, 
           event_level = "second") |> 
  autoplot()

weapon_roc_plot

# Persist the ggplot object for the cross-model ROC comparison.
saveRDS(weapon_roc_plot, here("models", "roc_graphs", "random_forest.rds"))
# Scalar AUC for the same predictions (event = level "1", as above).
weapon_pred |> 
  roc_auc(truth = WeaponCarryingSchool, 
           .pred_1, 
           event_level = "second")
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.683