# load required libraries
library(spotifyr) #API interaction
library(tidyverse)
library(tidymodels)
library(rsample)
library(recipes)
library(caret)
Comparing competing Machine Learning models in classifying Spotify songs
In this project, I explored four popular machine learning models (K-Nearest Neighbors, Decision Tree, Bagged Tree, and Random Forest) to classify Spotify songs into two genres: Underground Rap and Drum and Bass (dnb). I used a dataset from the Spotify API, containing 18 audio features of each song. The goal was to determine the best model for this classification task based on accuracy, sensitivity, and specificity.
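For context, a dataset like this can be pulled directly from the Spotify API with spotifyr. The snippet below is only a hedged sketch of that route: the credentials and playlist ID are placeholders, and the analysis in this post instead reads pre-downloaded CSV exports.
# a sketch of pulling audio features straight from the Spotify API
# (placeholder credentials and playlist ID; this post uses CSV files instead)
library(spotifyr)

Sys.setenv(SPOTIFY_CLIENT_ID = "your-client-id",         # placeholder
           SPOTIFY_CLIENT_SECRET = "your-client-secret") # placeholder
access_token <- get_spotify_access_token()

# danceability, energy, ..., tempo for every track on a playlist
dnb_features <- get_playlist_audio_features(
  username = "spotify",               # placeholder account
  playlist_uris = "your-playlist-id") # placeholder playlist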
Model Comparison
Implemented and tuned each model using grid search and cross-validation
Evaluated model performance using accuracy, Cohen's kappa, sensitivity, and specificity
Visualized model performance using confusion-matrix heatmaps and ROC AUC
Results
Random Forest model achieved the highest test accuracy (99.3%)
Bagged Tree model was a close second (99.2%)
Decision Tree (98.3%) and KNN (97.0%) followed
Data Preparation
# read the online Spotify data
genre <- read_csv("genres_v2.csv")
playlist <- read_csv("playlists.csv")
# join the data on the track id from genre and the playlist
spotify_data <- genre %>%
  left_join(playlist, by = c("id" = "Playlist"))
# filter to only the tracks of the two genres used for the classification task
spotify_data <- spotify_data %>%
  filter(genre %in% c("Underground Rap", "dnb")) %>%
  # rename Underground Rap to URap
  mutate(genre = ifelse(genre == "Underground Rap", "URap", genre))
Data Exploration
# remove columns that won't be fed to the models
spotify <- spotify_data %>%
  select(song_name, genre, danceability:tempo) %>%
  # change genre to a factor
  mutate(genre = as.factor(genre))
# find the top 20 most danceable tracks in the dataset
top_20_songs <- spotify %>%
  arrange(desc(danceability)) %>%
  head(20) %>%
  select(song_name, genre, danceability)
# output a formatted table
kableExtra::kable(top_20_songs,
                  caption = "Top 20 most danceable tracks in the dataset")
| song_name | genre | danceability |
|---|---|---|
| POP, LOCK & DROPDEAD | URap | 0.985 |
| LoyaltyRunsDeepInDaLongRun | URap | 0.985 |
| Two Left Feet Flow | URap | 0.984 |
| Hate Your Guts | URap | 0.983 |
| Mugen Woe | URap | 0.982 |
| The 3 | URap | 0.980 |
| Mavericks | URap | 0.979 |
| Technicolor | URap | 0.977 |
| Killmonger | URap | 0.977 |
| Funky Friday | URap | 0.975 |
| Nervous | URap | 0.975 |
| Go to the Sto | URap | 0.975 |
| Bad Bad Bad (feat. Lil Baby) | URap | 0.974 |
| Abraham Lincoln | URap | 0.974 |
| Psycho Pass | URap | 0.973 |
| Who the Fuck Is You | URap | 0.972 |
| Dottin Up | URap | 0.971 |
| Worst Day of My Life | URap | 0.971 |
| Excalibur | URap | 0.970 |
| Sexy | URap | 0.970 |
# difference between genres for some of the audio features
# --- drop song_name, not required for the models
spotify <- spotify %>% select(-song_name)

# set up manual colors
colors <- c("#7f58AF", "#64C5EB", "#E84D8A", "#FEB326", "lightblue")
# plot the data
spotify %>%
  group_by(genre) %>%
  summarise(across(danceability:tempo, mean)) %>%
  pivot_longer(cols = danceability:tempo, names_to = "audio_feature", values_to = "mean") %>%
  ggplot(aes(x = genre, y = mean, fill = genre)) +
  geom_col(position = "dodge") +
  facet_wrap(~audio_feature, scales = "free") +
  theme_classic() +
  xlab("") +
  ylab("") +
  theme(legend.position = "none") +
  scale_fill_manual(values = colors) +
  # add caption
  labs(caption = "Source: Spotify API") +
  # add title at the top center
  ggtitle("Difference between genres for some of the audio features")
1. K-Nearest Neighbor
The k-nearest neighbors (KNN) method is a non-parametric classification algorithm, also widely used in recommendation systems. For this Spotify dataset, KNN classifies a song by computing the distance between that song and every song in the training set based on their audio features, such as danceability or tempo. It then selects the k nearest neighbors, the training songs with the shortest distances to the target song, and predicts the genre that is most common among them. (The same idea powers recommenders, where the neighbors are users with similar listening histories.)
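To make the idea concrete before reaching for tidymodels, here is a toy, hand-rolled sketch (illustrative only; it assumes the spotify data frame prepared above). It classifies the first song by majority vote among its five nearest neighbors in standardized feature space.
# toy KNN by hand: Euclidean distances in standardized feature space
feats <- spotify %>% select(where(is.numeric)) %>% scale()
target <- feats[1, ]                                     # the song to classify
dists <- sqrt(rowSums(sweep(feats[-1, ], 2, target)^2))  # distance to every other song
nn <- order(dists)[1:5]                                  # indices of the 5 nearest songs
names(which.max(table(spotify$genre[-1][nn])))           # majority-vote genre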
#load the libraries specific for this model
library(dplyr)
library(ggplot2) #great plots
library(rsample) #data splitting
library(recipes) #data preprocessing
library(skimr) #data exploration
library(tidymodels) #re-entering tidymodel mode
library(kknn) #knn modeling
library(caTools) #for splitting the data
# split the data into training and testing sets
set.seed(123)
split <- sample.split(spotify$genre, SplitRatio = 0.7)
spotify_split <- initial_split(spotify) # rsample split, used later by last_fit()
train <- subset(spotify, split == TRUE)
test <- subset(spotify, split == FALSE)
# specify the recipe
knn_rec <- recipe(genre ~ ., data = train) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_normalize(all_numeric(), -all_outcomes())
## knn spec
knn_spec <- nearest_neighbor(neighbors = 5) %>%
  set_engine("kknn") %>%
  set_mode("classification")
#bake the data
#baked_train <- bake(knn_rec, train)
#apply to testing set
#baked_test <- bake(knn_rec, test)
Train the model with 5-fold cross-validation. The model is then tuned with grid search to find the best value of k, refit on the training set, and evaluated on the testing set with a confusion matrix, accuracy, sensitivity, and specificity.
# cross validation on the dataset
cv_folds <- vfold_cv(train, v = 5)
# put it all together in a workflow
knn_workflow <- workflow() %>%
  add_recipe(knn_rec) %>%
  add_model(knn_spec)
# fit the resamples and carry out validation
knn_res <- knn_workflow %>%
  fit_resamples(resamples = cv_folds,
                control = control_resamples(save_pred = TRUE))
# find the best value of k
knn_spec_tune <- nearest_neighbor(neighbors = tune()) %>%
  set_mode("classification") %>%
  set_engine("kknn")
# define a new workflow
wf_knn_tune <- workflow() %>%
  add_model(knn_spec_tune) %>%
  add_recipe(knn_rec)
# tune the model with grid search
fit_knn_cv <- wf_knn_tune %>%
  tune_grid(cv_folds, grid = data.frame(neighbors = c(1, 5, seq(10, 100, 10))))
# plot ROC AUC for changing values of neighbors
fit_knn_cv %>% collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(title = "ROC AUC of the KNN model with different values of k",
       x = "Number of neighbors",
       y = "ROC AUC") +
  theme_minimal()
# check the performance with collect_metrics
fit_knn_cv %>% collect_metrics()
# A tibble: 36 × 7
neighbors .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 accuracy binary 0.965 5 0.00170 Preprocessor1_Model01
2 1 brier_class binary 0.0351 5 0.00170 Preprocessor1_Model01
3 1 roc_auc binary 0.965 5 0.00309 Preprocessor1_Model01
4 5 accuracy binary 0.968 5 0.00176 Preprocessor1_Model02
5 5 brier_class binary 0.0258 5 0.00118 Preprocessor1_Model02
6 5 roc_auc binary 0.989 5 0.00111 Preprocessor1_Model02
7 10 accuracy binary 0.968 5 0.000903 Preprocessor1_Model03
8 10 brier_class binary 0.0247 5 0.000804 Preprocessor1_Model03
9 10 roc_auc binary 0.993 5 0.000952 Preprocessor1_Model03
10 20 accuracy binary 0.967 5 0.00105 Preprocessor1_Model04
# ℹ 26 more rows
# create a final workflow
final_wf <- wf_knn_tune %>%
  finalize_workflow(select_best(fit_knn_cv, metric = "accuracy"))
# fit the final model
final_fit <- final_wf %>% fit(data = train)
# predict on the testing set
spotify_pred <- final_fit %>% predict(new_data = test)
# write over 'final_fit' with this last_fit() approach
final_fit <- final_wf %>% last_fit(spotify_split)
# collect metrics on the test data!
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.968 Preprocessor1_Model1
2 roc_auc binary 0.992 Preprocessor1_Model1
3 brier_class binary 0.0237 Preprocessor1_Model1
# bind genre from the test data to the predictions
bind_test_pred <- spotify_pred %>% bind_cols(test)
# plot the confusion matrix
bind_test_pred %>%
  conf_mat(truth = genre, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  theme_minimal() +
  # remove legend
  theme(legend.position = "none")
# calculate accuracy, kappa, sensitivity and specificity
knn_estimate <- bind_test_pred %>%
  conf_mat(truth = genre, estimate = .pred_class) %>%
  summary() %>%
  head(4) %>%
  # rename .estimate to knn estimate
  rename("knn estimate" = .estimate)
2. Decision Tree
Decision trees use the CART algorithm to recursively split the data into two or more increasingly homogeneous subsets; the rpart engine chooses each split to minimize the Gini index. The model is tuned with grid search to find the best hyperparameters, then fit and evaluated on the testing set with a confusion matrix, accuracy, sensitivity, and specificity, as for the KNN model. A toy computation of the Gini criterion follows.
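As a quick illustration of the split criterion (a toy helper, not something rpart exposes), the Gini impurity of a node with class proportions p is 1 - sum(p^2); CART favors splits that minimize the weighted impurity of the resulting child nodes.
# toy Gini impurity: 0 for a pure node, 0.5 for a 50/50 binary node
gini <- function(labels) {
  p <- table(labels) / length(labels)
  1 - sum(p^2)
}
gini(c("URap", "URap", "URap"))       # 0
gini(c("URap", "dnb", "URap", "dnb")) # 0.5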
# load the packages for decision trees
library(MASS)
library(doParallel)
library(vip)
# split the data using the methods from class
genre_split <- initial_split(spotify)
genre_train <- training(genre_split)
genre_test <- testing(genre_split)
## Preprocess the data
genre_rec <- recipe(genre ~ ., data = genre_train) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_normalize(all_numeric(), -all_outcomes())
# new spec; tell the model that we are tuning the hyperparameters
tree_spec_tune <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()) %>%
  set_engine("rpart") %>%
  set_mode("classification")
Use grid search to find the best hyperparameters, training on the folded training set.
# grid for hyperparameter tuning
tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 5)
# set up k-fold CV and use the model on this data
genre_cv <- genre_train %>% vfold_cv(v = 10)
doParallel::registerDoParallel() # build trees in parallel
# set up the grid search
tree_rs <- tune_grid(tree_spec_tune, genre ~ .,
                     resamples = genre_cv,
                     grid = tree_grid,
                     metrics = metric_set(accuracy))
Use the workflow to finalize the model, fit it on the training set, and predict on the test set.
# finalize the model that has been cross-validated and hyperparameter-tuned
final_tree <- finalize_model(tree_spec_tune, select_best(tree_rs))
# similar functions here
final_tree_fit <- fit(final_tree, genre ~ ., data = genre_train)
# last_fit() fits on the training data (like fit()), but then also evaluates on the test data
final_tree_result <- last_fit(final_tree, genre ~ ., genre_split)
final_tree_result$.predictions
[[1]]
# A tibble: 2,211 × 6
.pred_dnb .pred_URap .row .pred_class genre .config
<dbl> <dbl> <int> <fct> <fct> <chr>
1 0 1 2 URap URap Preprocessor1_Model1
2 0 1 3 URap URap Preprocessor1_Model1
3 0 1 4 URap URap Preprocessor1_Model1
4 0.00157 0.998 6 URap URap Preprocessor1_Model1
5 0 1 7 URap URap Preprocessor1_Model1
6 0.00157 0.998 8 URap URap Preprocessor1_Model1
7 0 1 14 URap URap Preprocessor1_Model1
8 0.00157 0.998 16 URap URap Preprocessor1_Model1
9 0.00157 0.998 17 URap URap Preprocessor1_Model1
10 0.00157 0.998 20 URap URap Preprocessor1_Model1
# ℹ 2,201 more rows
# visualize variable importance
final_tree_fit %>%
  vip(geom = "col", aesthetics = list(fill = "midnightblue", alpha = 0.8)) +
  scale_y_continuous(expand = c(0, 0))
# how accurate is this model on the test data?
final_tree_result %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.983 Preprocessor1_Model1
2 roc_auc binary 0.982 Preprocessor1_Model1
3 brier_class binary 0.0166 Preprocessor1_Model1
# plot the confusion matrix
final_tree_result %>%
  collect_predictions() %>%
  conf_mat(truth = genre, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  theme_minimal() +
  theme(legend.position = "none")
# compute accuracy, kappa, sensitivity and specificity
dt_estimate <- final_tree_result %>%
  collect_predictions() %>%
  conf_mat(truth = genre, estimate = .pred_class) %>%
  summary() %>%
  head(4) %>%
  # rename .estimate to decision tree estimate
  rename("decision tree estimate" = .estimate)
3. Bagged Tree
Bagging (bootstrap aggregating) is an ensemble learning method that combines the predictions of multiple models to improve overall performance. It differs from the two previous models in that it is a collection of ML models rather than a single one: many base models are trained on different bootstrap samples of the training data, and their predictions are combined, here by majority vote, into a final prediction. The bagged tree uses decision trees as the base model. As with the KNN and decision tree models, it is tuned with grid search and evaluated on the testing set with a confusion matrix, accuracy, sensitivity, and specificity. A hand-rolled sketch of the idea appears below, before the baguette workflow.
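This sketch is illustrative only and assumes the genre_train/genre_test split from the previous section: each rpart tree is fit to a bootstrap resample, and the ensemble predicts by majority vote across trees.
# bagging by hand: bootstrap resamples + majority vote (illustrative sketch)
set.seed(123)
n_trees <- 25
trees <- lapply(seq_len(n_trees), function(i) {
  boot <- genre_train[sample(nrow(genre_train), replace = TRUE), ]
  rpart::rpart(genre ~ ., data = boot, method = "class")
})
# each column of 'votes' holds one tree's predictions for the test set
votes <- sapply(trees, function(t) as.character(predict(t, genre_test, type = "class")))
bag_vote <- apply(votes, 1, function(v) names(which.max(table(v))))
mean(bag_vote == genre_test$genre) # ensemble accuracy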
# Helper packages
library(doParallel) # for parallel backend to foreach
library(foreach) # for parallel processing with for loops
library(caret) # for general model fitting
library(rpart) # for fitting decision trees
library(ipred) # for bagging
library(baguette) # for bagging
library(tidymodels) # Assuming you have tidymodels installed
# split the data using the methods from class
genre_split <- initial_split(spotify)
genre_train <- training(genre_split)
genre_test <- testing(genre_split)
## Preprocess the data
genre_rec <- recipe(genre ~ ., data = genre_train) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_normalize(all_numeric(), -all_outcomes())
# instantiate the bagged model
bag_model <- bag_tree() %>%
  set_engine("rpart", times = 100) %>%
  set_mode("classification")
# create a workflow
bag_workflow <- workflow() %>%
  add_model(bag_model) %>%
  add_recipe(genre_rec)
## fold the data for the validation sets
genre_cv <- genre_train %>% vfold_cv(v = 10)
# use tune_grid to resample the model
# (no parameters are marked with tune() here, so this evaluates a single configuration)
bag_tune <- tune_grid(bag_workflow,
                      resamples = genre_cv,
                      grid = 10)
# collect metrics
bag_tune %>% collect_metrics()
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.990 10 0.00156 Preprocessor1_Model1
2 brier_class binary 0.00833 10 0.00112 Preprocessor1_Model1
3 roc_auc binary 0.998 10 0.000354 Preprocessor1_Model1
# show the best configuration
bag_best <- show_best(bag_tune, n = 1, metric = "roc_auc")
# finalize the workflow with the best configuration
finalize_bag <- finalize_workflow(bag_workflow, select_best(bag_tune, metric = "roc_auc"))
# fit the finalized model
bag_fit <- finalize_bag %>% fit(genre_train)
# predict on the test set
bag_pred <- bag_fit %>% predict(genre_test) %>%
  bind_cols(genre_test)
# predict with probability values
bag_pred_prob <- bag_fit %>% predict(genre_test, type = "prob") %>%
  bind_cols(genre_test)
#model metrics and evaluation
accuracy(bag_pred, truth = genre, estimate = .pred_class)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.992
# visualize the confusion matrix
bag_pred %>%
  conf_mat(truth = genre, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  theme_minimal() +
  # remove legend
  theme(legend.position = "none")
# compute accuracy, kappa, sensitivity and specificity
bag_estimate <- bag_pred %>%
  conf_mat(truth = genre, estimate = .pred_class) %>%
  summary() %>%
  head(4) %>%
  # rename .estimate to bagged tree estimate
  rename("bagged tree estimate" = .estimate)
4. Random Forest
Random forest is a popular ensemble learning method that, like bagging, combines the predictions of many decision trees. It differs from the bagged tree in that, in addition to bootstrapping the rows, each split considers only a random subset of the predictor columns (the mtry parameter); this column subsampling, which plain bagging lacks, decorrelates the trees. The model is again tuned with grid search to find the best hyperparameters. A small sketch of mtry in action follows.
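To see the role of mtry outside tidymodels, here is a minimal sketch using ranger directly (illustrative; it assumes the genre_train/genre_test split from the previous section). Setting mtry to the full number of predictors would collapse the forest back to plain bagged trees.
# random forest with ranger directly, highlighting the mtry column subsampling
rf_sketch <- ranger::ranger(genre ~ ., data = genre_train,
                            num.trees = 500,
                            mtry = floor(sqrt(ncol(genre_train) - 1))) # common default
mean(predict(rf_sketch, data = genre_test)$predictions == genre_test$genre) # test accuracy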
# random forest with R
library(ranger) # for random forest
# split the data using the methods from class
genre_split <- initial_split(spotify)
genre_train <- training(genre_split)
genre_test <- testing(genre_split)
# fold the data for the validation sets
cv_folds <- vfold_cv(genre_train, v = 5)
# recreate the previous recipe
genre_recipe <- recipe(genre ~ ., data = genre_train) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_normalize(all_numeric(), -all_outcomes())
# instantiate the model
rf_model <- rand_forest(mtry = tune(), trees = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")
# create a workflow
rf_workflow <- workflow() %>%
  add_model(rf_model) %>%
  add_recipe(genre_recipe)
# use tune_grid to tune the model
rf_tune <- tune_grid(rf_workflow,
                     resamples = cv_folds,
                     grid = 10)
# collect metrics
rf_tune %>% collect_metrics()
# A tibble: 30 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 10 1748 accuracy binary 0.991 5 0.00108 Preprocessor1_Mod…
2 10 1748 brier_class binary 0.00757 5 0.000706 Preprocessor1_Mod…
3 10 1748 roc_auc binary 0.999 5 0.000199 Preprocessor1_Mod…
4 7 69 accuracy binary 0.992 5 0.000983 Preprocessor1_Mod…
5 7 69 brier_class binary 0.00661 5 0.000573 Preprocessor1_Mod…
6 7 69 roc_auc binary 1.00 5 0.000158 Preprocessor1_Mod…
7 5 1242 accuracy binary 0.993 5 0.000805 Preprocessor1_Mod…
8 5 1242 brier_class binary 0.00665 5 0.000585 Preprocessor1_Mod…
9 5 1242 roc_auc binary 1.00 5 0.0000814 Preprocessor1_Mod…
10 6 464 accuracy binary 0.993 5 0.000873 Preprocessor1_Mod…
# ℹ 20 more rows
# show the best configuration
rf_best <- show_best(rf_tune, n = 1, metric = "roc_auc")
# finalize the workflow with the best configuration
final_rand_forest <- finalize_workflow(rf_workflow, select_best(rf_tune, metric = "roc_auc"))
# fit on the training set, then predict on the test set
final_rf_fit <- final_rand_forest %>% fit(genre_train)
# predict on the test set
final_rf_pred <- final_rf_fit %>% predict(genre_test) %>%
  bind_cols(genre_test)
#model metrics and evaluation
accuracy(final_rf_pred, truth = genre, estimate = .pred_class)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.993
# visualize the confusion matrix
final_rf_pred %>%
  conf_mat(truth = genre, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  theme_minimal() +
  # remove legend
  theme(legend.position = "none")
# compute accuracy, kappa, sensitivity and specificity
rf_estimate <- final_rf_pred %>%
  conf_mat(truth = genre, estimate = .pred_class) %>%
  summary() %>%
  head(4) %>%
  # rename .estimate to random forest estimate
  rename("random forest estimate" = .estimate)
Model Comparison
# compare all models: join the per-model estimates into one table
estimate_table <- knn_estimate %>%
  left_join(dt_estimate) %>%
  left_join(bag_estimate) %>%
  left_join(rf_estimate)

estimate_table
# A tibble: 4 × 6
.metric .estimator `knn estimate` `decision tree estimate`
<chr> <chr> <dbl> <dbl>
1 accuracy binary 0.970 0.983
2 kap binary 0.933 0.963
3 sens binary 0.970 0.976
4 spec binary 0.970 0.987
# ℹ 2 more variables: `bagged tree estimate` <dbl>,
# `random forest estimate` <dbl>
## plot the accuracy of all models as a bar chart
estimate_table %>%
  # keep only the accuracy row so the bars match the title
  filter(.metric == "accuracy") %>%
  pivot_longer(cols = c("knn estimate", "decision tree estimate", "bagged tree estimate", "random forest estimate"),
               names_to = "model", values_to = "accuracy") %>%
  ggplot(aes(x = model, y = accuracy, fill = model)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  xlab("") +
  ylab("accuracy") +
  theme(legend.position = "none") +
  labs(caption = "Source: Spotify API") +
  ggtitle("Accuracy of all models") +
  # format the y axis as percentages
  scale_y_continuous(labels = scales::percent) +
  # fill with manual colors
  scale_fill_manual(values = colors)
Conclusion
This project demonstrated the effectiveness of ensemble learning methods (Bagged Tree and Random Forest) in classifying Spotify songs into two genres. The results can be used to inform music recommendation systems or genre classification tasks. Along the way, I exercised data preprocessing, model implementation, hyperparameter tuning, and performance evaluation.