Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/tm absence #353

Merged
merged 6 commits into from
Jan 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: worldfootballR
Title: Extract and Clean World Football (Soccer) Data
Version: 0.6.4.0014
Version: 0.6.5
Authors@R: c(
person("Jason", "Zivkovic", , "[email protected]", role = c("aut", "cre", "cph")),
person("Tony", "ElHabr", , "[email protected]", role = "ctb"),
Expand Down
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ export(load_understat_league_shots)
export(player_dictionary_mapping)
export(player_transfer_history)
export(tm_expiring_contracts)
export(tm_get_player_absence)
export(tm_league_debutants)
export(tm_league_injuries)
export(tm_league_team_urls)
Expand All @@ -68,18 +69,21 @@ export(understat_team_players_stats)
export(understat_team_season_shots)
export(understat_team_stats_breakdown)
importFrom(cli,cli_alert)
importFrom(dplyr,across)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(dplyr,case_when)
importFrom(dplyr,filter)
importFrom(dplyr,mutate)
importFrom(dplyr,mutate_all)
importFrom(dplyr,pull)
importFrom(dplyr,select)
importFrom(dplyr,transmute)
importFrom(httr,GET)
importFrom(httr,add_headers)
importFrom(httr,content)
importFrom(httr,set_cookies)
importFrom(janitor,clean_names)
importFrom(jsonlite,fromJSON)
importFrom(lubridate,today)
importFrom(lubridate,ymd)
Expand All @@ -103,6 +107,7 @@ importFrom(rstudioapi,isAvailable)
importFrom(rstudioapi,versionInfo)
importFrom(rvest,html_attr)
importFrom(rvest,html_children)
importFrom(rvest,html_elements)
importFrom(rvest,html_node)
importFrom(rvest,html_nodes)
importFrom(rvest,html_table)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# worldfootballR (development version)

***

# worldfootballR 0.6.5

### Bugs

* `fb_league_stats()` failing for `playing_time`. (0.6.4.0001) [#314](https://github.com/JaseZiv/worldfootballR/issues/314)
Expand All @@ -20,6 +24,7 @@
* `fb_team_match_stats()` and `understat_available_teams()` (0.6.4.0004)
* `fb_match_shooting()`, `fb_advanced_match_stats()`, `fb_league_stats(team_or_player = "player")` gain `Player_Href` column (0.6.4.0005)
* `load_fb_advanced_stats()` and `load_fb_match_summary()` added (0.6.4.0007)
* `tm_get_player_absence()` now available to get a list of absences through suspension for players from transfermarkt


***
Expand Down
143 changes: 143 additions & 0 deletions R/tm_player_absence.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#' @importFrom rvest html_elements html_text html_table html_attr
#' @importFrom xml2 read_html
#' @importFrom purrr pluck
#' @importFrom janitor clean_names
#' @importFrom dplyr mutate across mutate_all
.tm_each_absence_page <- function(absence_page_url) {

absence_pg <- xml2::read_html(absence_page_url)

# get the main table, knowing that come columns won't be returned
main_df <- absence_pg |> rvest::html_elements("#yw1 .items") |> rvest::html_table() |> data.frame()

if(nrow(main_df) > 0) {

# create an object of each table row and the col headings
tab_rows <- absence_pg |> rvest::html_elements("#yw1 .items tbody tr")
tab_head <- absence_pg |> rvest::html_elements("#yw1 .items tr th") |> rvest::html_text()

# index of columns we need to get extra html elements for
competition_idx <- grep("competition", tolower(tab_head))
club_missed_idx <- grep("games missed", tolower(tab_head))


# parse competiton name
comp_name <- c()
for(i in 1:length(tab_rows)) {
each <- tab_rows[i] |>
rvest::html_elements("td") |>
purrr::pluck(competition_idx) |>
rvest::html_elements("img") |> rvest::html_attr("title") |>
.replace_empty_na()

comp_name <- c(comp_name, each)
}


# parse team name
club_name <- c()
for(i in 1:length(tab_rows)) {
each <- tab_rows[i] |>
rvest::html_elements("td") |>
purrr::pluck(club_missed_idx) |>
rvest::html_elements("a") |> rvest::html_attr("title") |>
.replace_empty_na()

club_name <- c(club_name, each)
}


main_df$Competition <- comp_name
main_df$club_missed <- club_name

main_df <- main_df |>
dplyr::mutate_all(as.character) |>
dplyr::mutate(dplyr::across(c("from", "until"), .tm_fix_dates)) |>
janitor::clean_names()

}

return(main_df)

}



#' Get Player Absences
#'
#' Returns data frame of a player's absences from suspension from transfermarkt.com
#'
#' @param player_urls player url(s) from transfermarkt
#'
#' @return returns a dataframe
#'
#' @importFrom rlang .data
#'
#' @export
#'
#' @examples
#' \dontrun{
#' try({
#' player_urls <- c("https://www.transfermarkt.com/cristian-romero/profil/spieler/355915",
#' "https://www.transfermarkt.com/micky-van-de-ven/profil/spieler/557459")
#'
#' df <- tm_get_player_absence(player_urls)
#' })
#' }
#'
tm_get_player_absence <- function(player_urls) {


.tm_each_players_absence <- function(player_url) {

pb$tick()

main_url <- "https://www.transfermarkt.com"

# # change the url to point to the absences url
player_url_changed <- gsub("profil", "ausfaelle", player_url)



player_page <- xml2::read_html(player_url_changed)

player_name <- player_page %>% rvest::html_nodes("h1") %>% rvest::html_text() %>% gsub("#[0-9]+ ", "", .) %>% stringr::str_squish()
player_meta <- data.frame(player_name = player_name,
player_url = player_url)

absence_paginated <- player_page |>
rvest::html_elements(".tm-pagination__list-item a") |> rvest::html_attr("href") |>
unique()

if(length(absence_paginated) == 0) {
absence_paginated <- player_url_changed
} else {
absence_paginated <- paste0(main_url, absence_paginated)
}


f_possibly <- purrr::possibly(.tm_each_absence_page, otherwise = data.frame(), quiet = FALSE)
player_out <- purrr::map_dfr(
absence_paginated,
f_possibly
)

player_out <- dplyr::bind_cols(player_meta, player_out)

return(player_out)

}

# create the progress bar with a progress function.
pb <- progress::progress_bar$new(total = length(player_urls))

f_possibly2 <- purrr::possibly(.tm_each_players_absence, otherwise = data.frame(), quiet = FALSE)
purrr::map_dfr(
player_urls,
f_possibly2
)

}



28 changes: 28 additions & 0 deletions man/tm_get_player_absence.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions tests/testthat/test-transfermarkt.R
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,17 @@ test_that("tm_player_injury_history() works", {
expect_false(nrow(hazard_injuries) == 0)

})


test_that("tm_get_player_absence() works", {
testthat::skip_on_cran()

player_absence <- tm_get_player_absence(player_urls = c("https://www.transfermarkt.com/cristian-romero/profil/spieler/355915",
"https://www.transfermarkt.com/micky-van-de-ven/profil/spieler/557459"))
expect_type(player_absence, "list")
expect_equal(ncol(player_absence), 10)
expect_false(nrow(player_absence) == 0)

})


21 changes: 20 additions & 1 deletion vignettes/extract-transfermarkt-data.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ dplyr::glimpse(hazard_injuries)
#----- for multiple players: -----#
# # can make use of a tm helper function:
# burnley_player_urls <- tm_team_player_urls(team_url = "https://www.transfermarkt.com/fc-burnley/startseite/verein/1132/saison_id/2021")
# # then pass all those URLs to the tm_player_bio
# # then pass all those URLs to the tm_player_injury_history
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

drive by fix

# burnley_player_injuries <- tm_player_injury_history(player_urls = burnley_player_urls)
```

Expand Down Expand Up @@ -325,6 +325,25 @@ dplyr::glimpse(all_leeds_united_players_transfer_history)
```



### Player Absence

To be able to get a player's (or players') absence history as a result of suspensions from transfermarkt, use the `tm_get_player_absence()` function.

```{r player_absence, eval=FALSE}
#----- for a single player: -----#
romero_absence <- tm_get_player_absence(player_urls = "https://www.transfermarkt.com/cristian-romero/profil/spieler/355915")
dplyr::glimpse(romero_absence)

#----- for multiple players: -----#
# # can make use of a tm helper function:
spurs_player_urls <- tm_team_player_urls(team_url = "https://www.transfermarkt.com/tottenham-hotspur/startseite/verein/148")
# # then pass all those URLs to the tm_get_player_absence
spurs_player_absence <- tm_get_player_absence(player_urls = spurs_player_urls)
```



***


Expand Down
Loading