Machine Learning

By Tainá Carreira da Rocha


This is the final report for the Machine Learning course by Curso-R. Its goal is to predict which visitors will make a purchase at the Google Virtual Store in the following month.


Read the data

# Load the training data. `fullVisitorId` is read as character explicitly:
# the ids are up to 20 digits long and can start with zeros, so they must not
# be left to type guessing (a numeric guess would mangle them).
ga = readr::read_csv(
  "data/ga_train.csv",
  col_types = readr::cols(fullVisitorId = readr::col_character())
)

# Column-by-column overview of the loaded table (types and first values).
dplyr::glimpse(ga)
## Rows: 1,061,278
## Columns: 38
## $ month                        <date> 2016-09-01, 2016-09-01, 2016-09-01, 2016…
## $ fullVisitorId                <chr> "000005103959234087", "000011415654313568…
## $ last_channel_grouping        <chr> "Organic Search", "Social", "Social", "So…
## $ last_ses_from_the_period_end <dbl> 11, 24, 23, 12, 5, 15, 7, 17, 13, 6, 17, …
## $ interval_dates               <dbl> 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,…
## $ unique_date_num              <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,…
## $ max_visit_num                <dbl> 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 5,…
## $ last_browser                 <chr> "Chrome", "Safari", "Opera Mini", "Chrome…
## $ last_deviceCategory          <chr> "mobile", "desktop", "mobile", "desktop",…
## $ last_continent               <chr> "Americas", "Asia", "Africa", "Asia", "Eu…
## $ last_operatingSystem         <chr> "Android", "Macintosh", "(not set)", "Win…
## $ last_subContinent            <chr> "Northern America", "Western Asia", "Nort…
## $ last_country                 <chr> "United States", "Turkey", "Sudan", "Phil…
## $ last_region                  <chr> "not available in demo dataset", "Istanbu…
## $ last_metro                   <chr> "not available in demo dataset", "(not se…
## $ last_city                    <chr> "not available in demo dataset", "Istanbu…
## $ last_networkDomain           <chr> "", "", "opera-min…
## $ last_source                  <chr> "google", "", "", "…
## $ last_medium                  <chr> "organic", "referral", "referral", "refer…
## $ prop_isMobile                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ prop_isTrueDirect            <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0…
## $ sum_hits                     <dbl> 10, 1, 1, 1, 1, 2, 2, 3, 2, 1, 2, 4, 46, …
## $ mean_hits                    <dbl> 10.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, …
## $ min_hits                     <dbl> 10, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 4, 46, …
## $ max_hits                     <dbl> 10, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 4, 46, …
## $ median_hits                  <dbl> 10.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, …
## $ sd_hits                      <dbl> NA, NA, NA, NA, NA, NA, 0.00000, NA, NA, …
## $ sum_pageviews                <dbl> 8, 1, 1, 1, 1, 2, 2, 3, 2, 1, 2, 3, 31, 2…
## $ mean_pageviews               <dbl> 8.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 2…
## $ min_pageviews                <dbl> 8, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 3, 31, 2…
## $ max_pageviews                <dbl> 8, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 3, 31, 2…
## $ median_pageviews             <dbl> 8.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 2…
## $ sd_pageviews                 <dbl> NA, NA, NA, NA, NA, NA, 0.00000, NA, NA, …
## $ bounce_sessions              <dbl> 0, 1, 1, 1, 1, 0, 2, 0, 0, 1, 0, 0, 0, 0,…
## $ session_cnt                  <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,…
## $ totalTransactionRevenue      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97…
## $ transactions                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ comprou                      <chr> "não", "não", "não", "não", "não", "não",…
# Class balance of the target: number of rows for each value of `comprou`.
ga |>
  dplyr::count(comprou)
## # A tibble: 2 × 2
##   comprou       n
##   <chr>     <int>
## 1 não     1058330
## 2 sim        2948

Train and test data

# Time-based split: rows from January and February 2018 form the assessment
# (hold-out) set, every other month goes to the analysis (training) set.
# `make_splits()` receives the row positions of each side plus the full data.
ga_initial_split = make_splits(
  x = list(
    analysis = which(!as.character(ga$month) %in% c("2018-01-01", "2018-02-01")),
    assessment = which(as.character(ga$month) %in% c("2018-01-01", "2018-02-01"))
  ),
  data = ga
)

# Materialise both sides of the split: the rows used to build the model and
# the rows held out for validation.
ga_train = ga_initial_split |> training()
ga_valid = ga_initial_split |> testing()

# Month-based sliding resamples over the training rows, indexed by `month`,
# with a look-back of 5 periods and a step of 2 between resamples.
ga_resamples = ga_train |>
  sliding_period(
    index = month,
    period = "month",
    lookback = 5,
    step = 2
  )

Exploratory analysis


Table: Table 1: Data summary

Name ga_train
Number of rows 930624
Number of columns 38
Column type frequency:
character 15
Date 1
numeric 22
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
fullVisitorId 0 1 14 20 0 885756 0
last_channel_grouping 0 1 6 14 0 8 0
last_browser 0 1 1 26 0 64 0
last_deviceCategory 0 1 6 7 0 3 0
last_continent 0 1 4 9 0 6 0
last_operatingSystem 0 1 3 16 0 22 0
last_subContinent 0 1 9 18 0 23 0
last_country 0 1 4 24 0 226 0
last_region 0 1 4 33 0 402 0
last_metro 0 1 6 55 0 99 0
last_city 0 1 3 33 0 745 0
last_networkDomain 0 1 2 64 0 32301 0
last_source 0 1 3 31 0 293 0
last_medium 0 1 3 9 0 7 0
comprou 0 1 3 3 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
month 0 1 2016-09-01 2017-12-01 2017-04-01 16

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
last_ses_from_the_period_end 0 1.00 13.70 7.84 1 7.00 14.00 20.00 2.700000e+01 ▇▆▆▆▇
interval_dates 0 1.00 0.47 2.34 0 0.00 0.00 0.00 2.600000e+01 ▇▁▁▁▁
unique_date_num 0 1.00 1.11 0.52 1 1.00 1.00 1.00 2.600000e+01 ▇▁▁▁▁
max_visit_num 0 1.00 1.49 3.41 1 1.00 1.00 1.00 4.080000e+02 ▇▁▁▁▁
prop_isMobile 0 1.00 1.00 0.00 1 1.00 1.00 1.00 1.000000e+00 ▁▁▇▁▁
prop_isTrueDirect 0 1.00 0.23 0.40 0 0.00 0.00 0.50 1.000000e+00 ▇▁▁▁▂
sum_hits 0 1.00 5.42 13.45 1 1.00 2.00 4.00 1.541000e+03 ▇▁▁▁▁
mean_hits 0 1.00 4.04 7.53 1 1.00 1.50 4.00 5.000000e+02 ▇▁▁▁▁
min_hits 0 1.00 3.58 6.97 1 1.00 1.00 3.00 5.000000e+02 ▇▁▁▁▁
max_hits 0 1.00 4.67 9.70 1 1.00 2.00 4.00 5.000000e+02 ▇▁▁▁▁
median_hits 0 1.00 3.96 7.45 1 1.00 1.00 4.00 5.000000e+02 ▇▁▁▁▁
sd_hits 819003 0.12 5.29 9.61 0 0.55 2.08 6.36 3.507200e+02 ▇▁▁▁▁
sum_pageviews 0 1.00 4.54 10.03 0 1.00 2.00 4.00 1.445000e+03 ▇▁▁▁▁
mean_pageviews 12 1.00 3.43 5.50 1 1.00 1.00 3.00 4.310000e+02 ▇▁▁▁▁
min_pageviews 0 1.00 Inf NaN 1 1.00 1.00 3.00 Inf ▇▁▁▁▁
max_pageviews 0 1.00 -Inf NaN -Inf 1.00 1.00 4.00 4.830000e+02 ▇▁▁▁▁
median_pageviews 12 1.00 3.37 5.45 1 1.00 1.00 3.00 4.310000e+02 ▇▁▁▁▁
sd_pageviews 819051 0.12 4.01 6.78 0 0.50 1.41 4.95 2.432400e+02 ▇▁▁▁▁
bounce_sessions 0 1.00 0.62 0.73 0 0.00 1.00 1.00 6.300000e+01 ▇▁▁▁▁
session_cnt 0 1.00 1.21 0.86 1 1.00 1.00 1.00 8.100000e+01 ▇▁▁▁▁
totalTransactionRevenue 0 1.00 2129194.73 119342544.05 0 0.00 0.00 0.00 9.277596e+10 ▇▁▁▁▁
transactions 0 1.00 0.01 0.14 0 0.00 0.00 0.00 1.500000e+01 ▇▁▁▁▁


# Correlation matrix of the numeric columns of the training data.
# "pairwise.complete.obs" computes each pair from the rows where both columns
# are present, so columns with many missing values do not blank the matrix.
# NOTE(review): the chain used to end in a dangling pipe; a plotting step was
# probably intended here — confirm and append it if so.
ga_train |>
  select(where(is.numeric)) |>
  cor(use = "pairwise.complete.obs")
Decision tree

Data prep

# Pre-processing recipe for the decision tree: `comprou` is the outcome and
# every other column starts out as a predictor.
ga_dt_recipe = recipe(comprou ~ ., data = ga_train) |>
  # Keep the period and the visitor id in the data, but not as predictors.
  update_role(month, new_role = "date") |>
  update_role(fullVisitorId, new_role = "id") |>
  # NOTE(review): no columns are selected in this removal step, so as written
  # it appears to drop nothing — confirm which columns were meant to be
  # removed at training time only (`skip = TRUE`).
  step_rm(skip = TRUE) |>
  # Down-sample on `comprou` to reduce the class imbalance of the target.
  themis::step_downsample(comprou, under_ratio = 10) |>
  # Reserve a level for categories that were not seen during training.
  step_novel(all_nominal_predictors()) |>
  # Drop predictors that hold a single value.
  step_zv(all_predictors())


# Decision-tree specification with all three hyperparameters left to tuning.
# The engine is stated explicitly; the fitted tree is later drawn with
# `rpart.plot()`, which expects an rpart fit.
ga_dt_model = decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) |>
  set_mode("classification") |>
  set_engine("rpart")


# Workflow bundling the model specification with its pre-processing recipe,
# so that tuning, fitting and prediction all apply the same steps.
ga_dt_wf = workflow() |>
  add_model(ga_dt_model) |>
  add_recipe(ga_dt_recipe)


# Random search grid with 3 candidate combinations of the tree
# hyperparameters, each drawn from the ranges given below.
grid_dt = grid_random(
  cost_complexity(c(-9, -1)),
  tree_depth(range = c(5, 15)),
  min_n(range = c(20, 40)),
  size = 3
)

# Evaluate every candidate in `grid_dt` on the monthly resamples, scoring
# each one by ROC AUC. The workflow to tune is the first argument.
ga_dt_tune_grid = tune_grid(
  ga_dt_wf,
  resamples = ga_resamples,
  grid = grid_dt,
  metrics = metric_set(roc_auc),
  control = control_grid(verbose = TRUE)
)

# Resample-averaged metric for each candidate.
collect_metrics(ga_dt_tune_grid)
## # A tibble: 3 × 9
##   cost_complexity tree_depth min_n .metric .estima…¹  mean     n std_err .config
##             <dbl>      <int> <int> <chr>   <chr>     <dbl> <int>   <dbl> <chr>  
## 1     0.000000372         13    23 roc_auc binary    0.907     5 0.00320 Prepro…
## 2     0.00000360          11    34 roc_auc binary    0.915     5 0.00340 Prepro…
## 3     0.00000132           6    31 roc_auc binary    0.841     5 0.0187  Prepro…
## # … with abbreviated variable name ¹​.estimator

Model performance

# Pick the candidate with the best ROC AUC. `metric` is passed by name:
# recent tune releases place it after `...`, so a positional value is
# rejected there, while the named form works in older releases as well.
ga_dt_best_params = select_best(ga_dt_tune_grid, metric = "roc_auc")

# Plug the winning hyperparameters into the workflow, then fit on the
# analysis rows and predict the assessment rows of the initial split.
ga_dt_wf = ga_dt_wf |> finalize_workflow(ga_dt_best_params)
ga_dt_last_fit = last_fit(ga_dt_wf, ga_initial_split)

# Hold-out predictions, tagged with a model label ("dt").
ga_test_preds = collect_predictions(ga_dt_last_fit) |> mutate(modelo = "dt")


# ROC curve of the hold-out predictions, one curve per model label, using
# the predicted probability of the "não" class.
# NOTE(review): the chain used to end in a dangling pipe; plotting the curve
# is assumed to be the intended last step — confirm.
ga_test_preds |>
  group_by(modelo) |>
  roc_curve(comprou, `.pred_não`) |>
  autoplot()

Variable importance

# Dig the fitted model out of the last-fit result: first stored workflow,
# then its `fit` element, then the `fit` element inside that.
ga_dt_last_fit_model = ga_dt_last_fit[[".workflow"]][[1]][["fit"]][["fit"]]

# Draw the underlying tree; `faclen = 2` is passed through to `rpart.plot()`.
rpart.plot(
  ga_dt_last_fit_model[["fit"]],
  faclen = 2
)
Final Model

# Fit the finalized workflow so that it can be used with `predict()`.
# NOTE(review): the chain used to end in a dangling pipe; fitting on the full
# labelled table `ga` is assumed — confirm the intended training data.
ga_final_dt_model = ga_dt_wf |>
  fit(data = ga)

Submission file

# Load the rows to score. `fullVisitorId` is read as character explicitly so
# long ids and leading zeros are kept exactly as they appear in the file.
ga_test = readr::read_csv(
  "data/ga_test.csv",
  col_types = readr::cols(fullVisitorId = readr::col_character())
)

# Add the predicted probability of the "sim" class as column `target`.
ga_submission = ga_test |>
  mutate(
    target = predict(ga_final_dt_model, new_data = ga_test, type = "prob")$.pred_sim
  )

# Shape the submission: the id becomes "<fullVisitorId>-<month>" and the
# predicted probability is exposed under the name `comprou`.
# NOTE(review): the chain used to end in a dangling pipe; writing a CSV is
# assumed and the output path below is a placeholder — confirm both.
ga_submission |>
  mutate(fullVisitorId = paste(fullVisitorId, month, sep = "-")) |>
  select(fullVisitorId, comprou = target) |>
  readr::write_csv("ga_submission.csv")


