# Setup: ensure the 'librarian' package manager is available, then load all
# packages used in this quiz and set a default ggplot2 theme.

# check if 'librarian' is installed and if not, install it
if (!"librarian" %in% rownames(installed.packages())) {
  install.packages("librarian")
}
# load packages if not already loaded (shelf() installs missing ones too)
librarian::shelf(
  ggplot2, magrittr, tidymodels, tidyverse, rsample, broom, recipes, parsnip, modeldata
)
# set the default theme for plotting
theme_set(theme_bw(base_size = 18) + theme(legend.position = "top"))
BSMM-quiz-1
SOLUTIONS
Packages
Quiz-1 (part 1)
Q-1
Is this data a tidy dataset?
Region | < $1M | $1 - $5M | $5 - $10M | $10 - $100M | > $100M |
---|---|---|---|---|---|
N America | $50M | $324M | $1045M | $941M | $1200M |
EMEA | $10M | $121M | $77M | $80M | $0M |
Delete the wrong answer:
# original table: build the wide (untidy) sales-by-region table and render it
# with gt. Note: this table is NOT tidy — the column headers ('< $1M', ...)
# are values of a variable (sales range), not variable names.
dat <- tibble::tibble(
  Region = c('N America', 'EMEA')
  , '< $1M' = c('$50M', '$10M')
  , '$1 - $5M' = c('$324M', '$121M')
  , '$5 - $10M' = c('$1045M', '$77M')
  , '$10 - $100M' = c('$941M', '$80M')
  , '> $100M' = c('$1200M', '$0M')
)
dat |> gt::gt() |> gtExtras::gt_theme_espn()
Region | < $1M | $1 - $5M | $5 - $10M | $10 - $100M | > $100M |
---|---|---|---|---|---|
N America | $50M | $324M | $1045M | $941M | $1200M |
EMEA | $10M | $121M | $77M | $80M | $0M |
# transformed table: pivot the range columns into key/value pairs so each row
# is one (Region, range, amount) observation — the tidy form of the table.
dat |>
  tidyr::pivot_longer(-Region, names_to = "range", values_to = "$ amount") |>
  gt::gt() |> gtExtras::gt_theme_espn()
Region | range | $ amount |
---|---|---|
N America | < $1M | $50M |
N America | $1 - $5M | $324M |
N America | $5 - $10M | $1045M |
N America | $10 - $100M | $941M |
N America | > $100M | $1200M |
EMEA | < $1M | $10M |
EMEA | $1 - $5M | $121M |
EMEA | $5 - $10M | $77M |
EMEA | $10 - $100M | $80M |
EMEA | > $100M | $0M |
Q-2
Which resampling method from the rsample:: package randomly partitions the data into V sets of roughly equal size?
Q-3
If I join the two tables below as follows:
dplyr::????_join(employees, departments, by = "department_id")
which type of join would include employee_name == Moe Syzslak?
- inner
- left
- right
- all of the above
Delete the incorrect answers.
employees - This table contains each employee’s ID, name, and department ID.
id | employee_name | department_id |
---|---|---|
1 | Homer Simpson | 4 |
2 | Ned Flanders | 1 |
3 | Barney Gumble | 5 |
4 | Clancy Wiggum | 3 |
5 | Moe Syzslak | NA |
departments - This table contains each department’s ID and name.
department_id | department_name |
---|---|
1 | Sales |
2 | Engineering |
3 | Human Resources |
4 | Customer Service |
5 | Research And Development |
# employees table: each row is one employee (id, name, department id).
# Moe Syzslak has department_id == NA, which drives the join question above.
tbl1 <- tibble::tribble(
  ~id, ~employee_name,  ~department_id,
  1,   'Homer Simpson', 4,
  2,   'Ned Flanders',  1,
  3,   'Barney Gumble', 5,
  4,   'Clancy Wiggum', 3,
  5,   'Moe Syzslak',   NA
)
# departments table: lookup of department_id -> department_name.
# Department 2 (Engineering) has no employees, which drives the right-join result.
tbl2 <- tibble::tribble(
  ~department_id, ~department_name,
  1,              "Sales",
  2,              "Engineering",
  3,              "Human Resources",
  4,              "Customer Service",
  5,              "Research And Development"
)
# left_join: keeps every row of tbl1 (all employees), so Moe Syzslak is
# retained with department_name == NA.
dplyr::left_join(tbl1, tbl2, by = "department_id") |>
  gt::gt() |> gt::tab_header(title = "Left Join") |>
  gtExtras::gt_theme_espn()
Left Join | |||
---|---|---|---|
id | employee_name | department_id | department_name |
1 | Homer Simpson | 4 | Customer Service |
2 | Ned Flanders | 1 | Sales |
3 | Barney Gumble | 5 | Research And Development |
4 | Clancy Wiggum | 3 | Human Resources |
5 | Moe Syzslak | NA | NA |
# right_join: keeps every row of tbl2 (all departments), so Moe Syzslak
# (department_id == NA) is dropped and Engineering appears with NA employee.
dplyr::right_join(tbl1, tbl2, by = "department_id") |>
  gt::gt() |> gt::tab_header(title = "Right Join") |>
  gtExtras::gt_theme_espn()
Right Join | |||
---|---|---|---|
id | employee_name | department_id | department_name |
1 | Homer Simpson | 4 | Customer Service |
2 | Ned Flanders | 1 | Sales |
3 | Barney Gumble | 5 | Research And Development |
4 | Clancy Wiggum | 3 | Human Resources |
NA | NA | 2 | Engineering |
# inner_join: keeps only rows with a department_id match in BOTH tables,
# so both Moe Syzslak and Engineering are dropped.
dplyr::inner_join(tbl1, tbl2, by = "department_id") |>
  gt::gt() |> gt::tab_header(title = "Inner Join") |>
  gtExtras::gt_theme_espn()
Inner Join | |||
---|---|---|---|
id | employee_name | department_id | department_name |
1 | Homer Simpson | 4 | Customer Service |
2 | Ned Flanders | 1 | Sales |
3 | Barney Gumble | 5 | Research And Development |
4 | Clancy Wiggum | 3 | Human Resources |
Q-4
Recall that the first step of a decision-tree regression model will divide the space of predictors into 2 parts and estimate constant prediction values for each part. For a single predictor $x$ with split point $s$, the result of the first step estimates the outcome as: $\hat{y} = c_1\,\mathbf{1}(x < s) + c_2\,\mathbf{1}(x \ge s)$ such that $\mathrm{RSS} = \sum_{i:\,x_i < s}(y_i - c_1)^2 + \sum_{i:\,x_i \ge s}(y_i - c_2)^2$ is minimized.
On the first split of a decision tree regression model for the following data:
The first two regions that partition the predictor space will be (Delete the wrong answer(s) below):
Q-5
In an ordinary linear regression, regressing the outcome $y$ on a single predictor $x$, the regression coefficient can be estimated as: $\hat{\beta}_1 = \dfrac{\sum_{i}(x_i - \bar{x})(y_i - \bar{y})}{\sum_{i}(x_i - \bar{x})^2} = \dfrac{\widehat{\mathrm{cov}}(x, y)}{\widehat{\mathrm{var}}(x)}$
Quiz-1 (part 2)
Q6
Write code to determine the number of species of penguin in the dataset. How many are there?
Q7
Execute the following code to read sales data from a csv file.
# read sales data: load the csv, standardize column names to snake_case,
# then reduce orderdate to just the calendar year for the yearly summary below.
sales_dat <-
  readr::read_csv("data/sales_data_sample.csv", show_col_types = FALSE) |>
  janitor::clean_names() |>
  dplyr::mutate(
    # parse the "m/d/Y H:M" string into a Date, then keep only the year
    orderdate = lubridate::as_date(orderdate, format = "%m/%d/%Y %H:%M")
    , orderdate = lubridate::year(orderdate)
  )
Describe what the group_by
step does in the code below, and complete the code to produce a sales summary by year, i.e. a data.frame where productline
and orderdate
are the columns (one column for each year), while each year column contains the sales for each productline
that year.
# group_by forms one group per (orderdate, productline) combination, so the
# subsequent summarize computes total sales for each product line in each year.
# pivot_wider then spreads the years into columns (one column per year).
sales_dat |>
  dplyr::group_by(orderdate, productline) |>
  dplyr::summarize( sales = sum(sales) ) |>
  tidyr::pivot_wider(names_from = orderdate, values_from = sales)
Q8
For the data below, it is expected that the response variable $y$ can be described by the independent variables $x_1$ and $x_2$. This implies that the parameters of the following model should be estimated and tested per the model: $y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \epsilon$
# simulated data for the two-predictor regression in Q8/Q9 (n = 25 rows)
dat <- tibble::tibble(
  x1 = c(0.58, 0.86, 0.29, 0.20, 0.56, 0.28, 0.08, 0.41, 0.22, 0.35, 0.59, 0.22, 0.26, 0.12, 0.65, 0.70, 0.30
         , 0.70, 0.39, 0.72, 0.45, 0.81, 0.04, 0.20, 0.95)
  , x2 = c(0.71, 0.13, 0.79, 0.20, 0.56, 0.92, 0.01, 0.60, 0.70, 0.73, 0.13, 0.96, 0.27, 0.21, 0.88, 0.30
           , 0.15, 0.09, 0.17, 0.25, 0.30, 0.32, 0.82, 0.98, 0.00)
  , y = c(1.45, 1.93, 0.81, 0.61, 1.55, 0.95, 0.45, 1.14, 0.74, 0.98, 1.41, 0.81, 0.89, 0.68, 1.39, 1.53
          , 0.91, 1.49, 1.38, 1.73, 1.11, 1.68, 0.66, 0.69, 1.98)
)
Calculate the parameter estimates ($\hat{\beta}_0$, $\hat{\beta}_1$, and $\hat{\beta}_2$); in addition find the usual 95% confidence intervals for $\beta_0$, $\beta_1$, $\beta_2$.
Q9
Using the .resid
column created by broom::augment(___, dat)
, calculate $\hat{\sigma}$ (presumably the residual standard error — the dropped symbol here should be confirmed against the original quiz).
Q10
Does the following code train a model on the full training set of the modeldata::ames
housing dataset and then evaluate the model using a test set?
Is any step missing?
When the recipe is baked and prepped, do you think all categories will be converted to dummy variables and all numeric predictors will be normalized?
Grading (10 pts)
Part | Points |
---|---|
Part 1 - Conceptual | 5 |
Part 2 - Applied | 5 |
Total | 10 |