diff --git a/DESCRIPTION b/DESCRIPTION index 15ac8c28..2ab89d68 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: worldfootballR Title: Extract and Clean World Football (Soccer) Data -Version: 0.6.4.0014 +Version: 0.6.5 Authors@R: c( person("Jason", "Zivkovic", , "jaseziv83@gmail.com", role = c("aut", "cre", "cph")), person("Tony", "ElHabr", , "anthonyelhabr@gmail.com", role = "ctb"), diff --git a/NAMESPACE b/NAMESPACE index ee9afc39..99f785a7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,7 @@ export(load_understat_league_shots) export(player_dictionary_mapping) export(player_transfer_history) export(tm_expiring_contracts) +export(tm_get_player_absence) export(tm_league_debutants) export(tm_league_injuries) export(tm_league_team_urls) @@ -68,11 +69,13 @@ export(understat_team_players_stats) export(understat_team_season_shots) export(understat_team_stats_breakdown) importFrom(cli,cli_alert) +importFrom(dplyr,across) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) importFrom(dplyr,filter) importFrom(dplyr,mutate) +importFrom(dplyr,mutate_all) importFrom(dplyr,pull) importFrom(dplyr,select) importFrom(dplyr,transmute) @@ -80,6 +83,7 @@ importFrom(httr,GET) importFrom(httr,add_headers) importFrom(httr,content) importFrom(httr,set_cookies) +importFrom(janitor,clean_names) importFrom(jsonlite,fromJSON) importFrom(lubridate,today) importFrom(lubridate,ymd) @@ -103,6 +107,7 @@ importFrom(rstudioapi,isAvailable) importFrom(rstudioapi,versionInfo) importFrom(rvest,html_attr) importFrom(rvest,html_children) +importFrom(rvest,html_elements) importFrom(rvest,html_node) importFrom(rvest,html_nodes) importFrom(rvest,html_table) diff --git a/NEWS.md b/NEWS.md index 694cad6c..45894344 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # worldfootballR (development version) +*** + +# worldfootballR 0.6.5 + ### Bugs * `fb_league_stats()` failing for `playing_time`. (0.6.4.0001) [#314](https://github.com/JaseZiv/worldfootballR/issues/314) @@ -20,6 +24,7 @@ * `fb_team_match_stats()` and `understat_available_teams()` (0.6.4.0004) * `fb_match_shooting()`, `fb_advanced_match_stats()`, `fb_league_stats(team_or_player = "player")` gain `Player_Href` column (0.6.4.0005) * `load_fb_advanced_stats()` and `load_fb_match_summary()` added (0.6.4.0007) +* `tm_get_player_absence()` now available to get a list of absences through suspension for players from transfermarkt *** diff --git a/R/tm_player_absence.R b/R/tm_player_absence.R new file mode 100644 index 00000000..68c4062b --- /dev/null +++ b/R/tm_player_absence.R @@ -0,0 +1,143 @@ +#' @importFrom rvest html_elements html_text html_table html_attr +#' @importFrom xml2 read_html +#' @importFrom purrr pluck +#' @importFrom janitor clean_names +#' @importFrom dplyr mutate across mutate_all +.tm_each_absence_page <- function(absence_page_url) { + + absence_pg <- xml2::read_html(absence_page_url) + + # get the main table, knowing that come columns won't be returned + main_df <- absence_pg |> rvest::html_elements("#yw1 .items") |> rvest::html_table() |> data.frame() + + if(nrow(main_df) > 0) { + + # create an object of each table row and the col headings + tab_rows <- absence_pg |> rvest::html_elements("#yw1 .items tbody tr") + tab_head <- absence_pg |> rvest::html_elements("#yw1 .items tr th") |> rvest::html_text() + + # index of columns we need to get extra html elements for + competition_idx <- grep("competition", tolower(tab_head)) + club_missed_idx <- grep("games missed", tolower(tab_head)) + + + # parse competiton name + comp_name <- c() + for(i in 1:length(tab_rows)) { + each <- tab_rows[i] |> + rvest::html_elements("td") |> + purrr::pluck(competition_idx) |> + rvest::html_elements("img") |> rvest::html_attr("title") |> + .replace_empty_na() + + comp_name <- c(comp_name, each) + } + + + # parse team name + club_name <- c() + for(i in 1:length(tab_rows)) { + each <- tab_rows[i] |> + rvest::html_elements("td") |> + purrr::pluck(club_missed_idx) |> + rvest::html_elements("a") |> rvest::html_attr("title") |> + .replace_empty_na() + + club_name <- c(club_name, each) + } + + + main_df$Competition <- comp_name + main_df$club_missed <- club_name + + main_df <- main_df |> + dplyr::mutate_all(as.character) |> + dplyr::mutate(dplyr::across(c("from", "until"), .tm_fix_dates)) |> + janitor::clean_names() + + } + + return(main_df) + +} + + + +#' Get Player Absences +#' +#' Returns data frame of a player's absences from suspension from transfermarkt.com +#' +#' @param player_urls player url(s) from transfermarkt +#' +#' @return returns a dataframe +#' +#' @importFrom rlang .data +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' try({ +#' player_urls <- c("https://www.transfermarkt.com/cristian-romero/profil/spieler/355915", +#' "https://www.transfermarkt.com/micky-van-de-ven/profil/spieler/557459") +#' +#' df <- tm_get_player_absence(player_urls) +#' }) +#' } +#' +tm_get_player_absence <- function(player_urls) { + + + .tm_each_players_absence <- function(player_url) { + + pb$tick() + + main_url <- "https://www.transfermarkt.com" + + # # change the url to point to the absences url + player_url_changed <- gsub("profil", "ausfaelle", player_url) + + + + player_page <- xml2::read_html(player_url_changed) + + player_name <- player_page %>% rvest::html_nodes("h1") %>% rvest::html_text() %>% gsub("#[0-9]+ ", "", .) %>% stringr::str_squish() + player_meta <- data.frame(player_name = player_name, + player_url = player_url) + + absence_paginated <- player_page |> + rvest::html_elements(".tm-pagination__list-item a") |> rvest::html_attr("href") |> + unique() + + if(length(absence_paginated) == 0) { + absence_paginated <- player_url_changed + } else { + absence_paginated <- paste0(main_url, absence_paginated) + } + + + f_possibly <- purrr::possibly(.tm_each_absence_page, otherwise = data.frame(), quiet = FALSE) + player_out <- purrr::map_dfr( + absence_paginated, + f_possibly + ) + + player_out <- dplyr::bind_cols(player_meta, player_out) + + return(player_out) + + } + + # create the progress bar with a progress function. + pb <- progress::progress_bar$new(total = length(player_urls)) + + f_possibly2 <- purrr::possibly(.tm_each_players_absence, otherwise = data.frame(), quiet = FALSE) + purrr::map_dfr( + player_urls, + f_possibly2 + ) + +} + + + diff --git a/man/tm_get_player_absence.Rd b/man/tm_get_player_absence.Rd new file mode 100644 index 00000000..6d21bb54 --- /dev/null +++ b/man/tm_get_player_absence.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tm_player_absence.R +\name{tm_get_player_absence} +\alias{tm_get_player_absence} +\title{Get Player Absences} +\usage{ +tm_get_player_absence(player_urls) +} +\arguments{ +\item{player_urls}{player url(s) from transfermarkt} +} +\value{ +returns a dataframe +} +\description{ +Returns data frame of a player's absences from suspension from transfermarkt.com +} +\examples{ +\dontrun{ +try({ +player_urls <- c("https://www.transfermarkt.com/cristian-romero/profil/spieler/355915", +"https://www.transfermarkt.com/micky-van-de-ven/profil/spieler/557459") + +df <- tm_get_player_absence(player_urls) +}) +} + +} diff --git a/tests/testthat/test-transfermarkt.R b/tests/testthat/test-transfermarkt.R index 9e2c7160..acf7fb8e 100644 --- a/tests/testthat/test-transfermarkt.R +++ b/tests/testthat/test-transfermarkt.R @@ -238,3 +238,17 @@ test_that("tm_player_injury_history() works", { expect_false(nrow(hazard_injuries) == 0) }) + + +test_that("tm_get_player_absence() works", { + testthat::skip_on_cran() + + player_absence <- tm_get_player_absence(player_urls = c("https://www.transfermarkt.com/cristian-romero/profil/spieler/355915", + "https://www.transfermarkt.com/micky-van-de-ven/profil/spieler/557459")) + expect_type(player_absence, "list") + expect_equal(ncol(player_absence), 10) + expect_false(nrow(player_absence) == 0) + +}) + + diff --git a/vignettes/extract-transfermarkt-data.Rmd b/vignettes/extract-transfermarkt-data.Rmd index fa41e051..7c03981a 100644 --- a/vignettes/extract-transfermarkt-data.Rmd +++ b/vignettes/extract-transfermarkt-data.Rmd @@ -297,7 +297,7 @@ dplyr::glimpse(hazard_injuries) #----- for multiple players: -----# # # can make use of a tm helper function: # burnley_player_urls <- tm_team_player_urls(team_url = "https://www.transfermarkt.com/fc-burnley/startseite/verein/1132/saison_id/2021") -# # then pass all those URLs to the tm_player_bio +# # then pass all those URLs to the tm_player_injury_history # burnley_player_injuries <- tm_player_injury_history(player_urls = burnley_player_urls) ``` @@ -325,6 +325,25 @@ dplyr::glimpse(all_leeds_united_players_transfer_history) ``` + +### Player Absence + +To be able to get a player's (or players') absence history as a result of suspensions from transfermarkt, use the `tm_get_player_absence()` function. + +```{r player_absence, eval=FALSE} +#----- for a single player: -----# +romero_absence <- tm_get_player_absence(player_urls = "https://www.transfermarkt.com/cristian-romero/profil/spieler/355915") +dplyr::glimpse(romero_absence) + +#----- for multiple players: -----# +# # can make use of a tm helper function: +spurs_player_urls <- tm_team_player_urls(team_url = "https://www.transfermarkt.com/tottenham-hotspur/startseite/verein/148") +# # then pass all those URLs to the tm_get_player_absence +spurs_player_absence <- tm_get_player_absence(player_urls = spurs_player_urls) +``` + + + ***