12.1 PCA
rec <- recipes::recipe(Sale_Price ~ .,
data = ames) %>%
recipes::step_center(all_numeric()) %>%
recipes::step_scale(all_numeric())
12.1.1 Prep/bake
prep
: trains a datasetbake
: apply a trained model to new data
However, you do not need to do this since the fit
functions do this for you too
rec %>%
recipes::prep(training = ames) %>%
recipes::bake(new_data = ames)
## # A tibble: 2,930 x 81
## MS_SubClass MS_Zoning Lot_Frontage Lot_Area Street Alley Lot_Shape
## <fct> <fct> <dbl> <dbl> <fct> <fct> <fct>
## 1 One_Story_… Resident… 2.49 2.74 Pave No_A… Slightly…
## 2 One_Story_… Resident… 0.667 0.187 Pave No_A… Regular
## 3 One_Story_… Resident… 0.697 0.523 Pave No_A… Slightly…
## 4 One_Story_… Resident… 1.06 0.128 Pave No_A… Regular
## 5 Two_Story_… Resident… 0.488 0.467 Pave No_A… Slightly…
## 6 Two_Story_… Resident… 0.608 -0.0216 Pave No_A… Slightly…
## 7 One_Story_… Resident… -0.497 -0.663 Pave No_A… Regular
## 8 One_Story_… Resident… -0.437 -0.653 Pave No_A… Slightly…
## 9 One_Story_… Resident… -0.557 -0.604 Pave No_A… Slightly…
## 10 Two_Story_… Resident… 0.0702 -0.336 Pave No_A… Regular
## # … with 2,920 more rows, and 74 more variables: Land_Contour <fct>,
## # Utilities <fct>, Lot_Config <fct>, Land_Slope <fct>, Neighborhood <fct>,
## # Condition_1 <fct>, Condition_2 <fct>, Bldg_Type <fct>, House_Style <fct>,
## # Overall_Qual <fct>, Overall_Cond <fct>, Year_Built <dbl>,
## # Year_Remod_Add <dbl>, Roof_Style <fct>, Roof_Matl <fct>,
## # Exterior_1st <fct>, Exterior_2nd <fct>, Mas_Vnr_Type <fct>,
## # Mas_Vnr_Area <dbl>, Exter_Qual <fct>, Exter_Cond <fct>, Foundation <fct>,
## # Bsmt_Qual <fct>, Bsmt_Cond <fct>, Bsmt_Exposure <fct>,
## # BsmtFin_Type_1 <fct>, BsmtFin_SF_1 <dbl>, BsmtFin_Type_2 <fct>,
## # BsmtFin_SF_2 <dbl>, Bsmt_Unf_SF <dbl>, Total_Bsmt_SF <dbl>, Heating <fct>,
## # Heating_QC <fct>, Central_Air <fct>, Electrical <fct>, First_Flr_SF <dbl>,
## # Second_Flr_SF <dbl>, Low_Qual_Fin_SF <dbl>, Gr_Liv_Area <dbl>,
## # Bsmt_Full_Bath <dbl>, Bsmt_Half_Bath <dbl>, Full_Bath <dbl>,
## # Half_Bath <dbl>, Bedroom_AbvGr <dbl>, Kitchen_AbvGr <dbl>,
## # Kitchen_Qual <fct>, TotRms_AbvGrd <dbl>, Functional <fct>,
## # Fireplaces <dbl>, Fireplace_Qu <fct>, Garage_Type <fct>,
## # Garage_Finish <fct>, Garage_Cars <dbl>, Garage_Area <dbl>,
## # Garage_Qual <fct>, Garage_Cond <fct>, Paved_Drive <fct>,
## # Wood_Deck_SF <dbl>, Open_Porch_SF <dbl>, Enclosed_Porch <dbl>,
## # Three_season_porch <dbl>, Screen_Porch <dbl>, Pool_Area <dbl>,
## # Pool_QC <fct>, Fence <fct>, Misc_Feature <fct>, Misc_Val <dbl>,
## # Mo_Sold <dbl>, Year_Sold <dbl>, Sale_Type <fct>, Sale_Condition <fct>,
## # Longitude <dbl>, Latitude <dbl>, Sale_Price <dbl>
12.1.2 Dummy variables
- aka one-hot encoding
- You don’t need this for decision trees or ensembles of trees
12.1.3 Step novel
- A catch all for new categories that the model may not have trained on
- Do this before dummy encoding
rec %>%
recipes::step_novel(all_nominal()) %>%
recipes::step_dummy(all_nominal())
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 80
##
## Operations:
##
## Centering for all_numeric
## Scaling for all_numeric
## Novel factor level assignment for all_nominal
## Dummy variables from all_nominal
12.1.4 remove 0 variance
- Remove columns where there is only 1 value in it
rec %>%
recipes::step_novel(all_nominal()) %>%
recipes::step_dummy(all_nominal()) %>%
recipes::step_zv(all_predictors())
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 80
##
## Operations:
##
## Centering for all_numeric
## Scaling for all_numeric
## Novel factor level assignment for all_nominal
## Dummy variables from all_nominal
## Zero variance filter on all_predictors
12.1.5 PCA
rec %>%
recipes::step_novel(all_nominal()) %>%
recipes::step_dummy(all_nominal()) %>%
recipes::step_zv(all_predictors()) %>%
recipes::step_pca(all_numeric(), num_comp = 5)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 80
##
## Operations:
##
## Centering for all_numeric
## Scaling for all_numeric
## Novel factor level assignment for all_nominal
## Dummy variables from all_nominal
## Zero variance filter on all_predictors
## No PCA components were extracted.