From 83a7a306f50363b1e6ab48db14dcc86376b2b0f4 Mon Sep 17 00:00:00 2001 From: Tony ElHabr Date: Wed, 22 Nov 2023 18:54:16 -0600 Subject: [PATCH 1/3] Create tm_player_transfer_history.R --- R/tm_player_transfer_history.R | 147 +++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 R/tm_player_transfer_history.R diff --git a/R/tm_player_transfer_history.R b/R/tm_player_transfer_history.R new file mode 100644 index 00000000..5d4cb740 --- /dev/null +++ b/R/tm_player_transfer_history.R @@ -0,0 +1,147 @@ + +.standardize_fee <- function(x) { + case_when( + grepl("free", x) ~ "Free transfer", + grepl("end of", x) ~ "End of loan", + grepl("fee", x) ~ "Paid loan", + grepl("loan", x) ~ "Loan", + grepl("m|th.|k", x) ~ "Transfer", + TRUE ~ NA_character_ + ) +} + +tm_player_transfer_history2 <- function(player_urls, get_extra_info = TRUE) { + + player_url <- "https://www.transfermarkt.com/jack-rodwell/profil/spieler/57079" + get_extra_info <- TRUE + + single_player_transfer_history <- function(player_url) { + pb$tick() + + main_url <- "https://www.transfermarkt.us.com" + + page <- xml2::read_html(player_url) + + player_name <- page %>% + rvest::html_node("h1") %>% + rvest::html_text() %>% + gsub("#[[:digit:]]+ ", "", .) %>% + stringr::str_squish() + + player_id <- basename(player_url) + transfer_history_resp <- httr::GET( + url = paste0("https://www.transfermarkt.com/ceapi/transferHistory/list/", player_id), + httr::add_headers(.headers = c( + `User-Agent` = getOption("worldfootballR.agent") + )) + ) + + transfer_history_content <- content(transfer_history_resp) + raw_transfer_history <- transfer_history_content[["transfers"]] + + formatted_transfer_history <- purrr::map( + raw_transfer_history, + \(.x) { + list( + "url" = .x[["url"]], + "season" = .x[["season"]], + "transfer_date" = lubridate::ymd(.x[["dateUnformatted"]]), + "team_from" = .x[["from"]][["clubName"]], + "team_to" = .x[["to"]][["clubName"]], + "market_value" = .convert_value_to_numeric(.x[["marketValue"]]), + "fee" = .standardize_fee(.x[["fee"]]) + ) + } + ) + + # Executed if the user wants to get more info. Contains: Contract Expiring date + Days remaining. From which countries was the transfer. + if (isTRUE(get_extra_info)) { + res <- purrr::map( + formatted_transfer_history, + \(.x) { + url <- .x$url + extra_info <- tryCatch(xml2::read_html(url), error = function(e) NA) + contract_box <- extra_info %>% + rvest::html_nodes(".large-4.columns") %>% + rvest::html_node("table") %>% + rvest::html_children() + + contract_idx <- contract_box %>% + rvest::html_text() %>% + grep("Remaining contract duration", .) + + if(is.na(extra_info)) { + contract_expiry <- NA + days_remaining <- NA + } else { + text_to_remove <- contract_box[contract_idx] %>% rvest::html_nodes("b") %>% rvest::html_text() + if(length(text_to_remove) == 0) { + contract_expiry <- NA + days_remaining <- NA + } else { + contract_expiry <- contract_box[contract_idx] %>% + rvest::html_text() %>% + gsub(text_to_remove, "", .) %>% + stringr::str_squish() %>% + gsub(".*\\((.*)\\).*", "\\1", .) %>% + .tm_fix_dates() %>% + lubridate::ymd() + days_remaining <- difftime(contract_expiry, transfer_date, units = c("days")) %>% as.numeric() + } + } + + country_to <- tryCatch( + extra_info %>% + rvest::html_nodes(".large-4.columns table .rechts .flaggenrahmen") %>% + rvest::html_attr("title"), + error = function(e) NA + ) + + if(length(country_to) < 1){ + country_to <- NA + } + + countries <- tryCatch( + extra_info %>% + rvest::html_nodes(".large-4.columns table .flaggenrahmen") %>% + rvest::html_attr("title"), + error = function(e) NA + ) + + if (length(countries) < 2) { + if (is.na(country_to)) { + country_from <- countries[1] + } else { + country_from <- NA + } + } else { + country_from <- countries[1] + } + + .x[["contract_expiry"]] <- contract_expiry + .x[["days_remaining"]] <- days_remaining + .x[["country_from"]] <- country_from + .x[["country_to"]] <- country_to + .x + } + ) + } else { + res <- dplyr::bind_rows(formatted_transfer_history) + } + + res <- dplyr::mutate( + res, + "player_name" = player_name, + .before = 1 + ) + res[["url"]] <- NULL + res + } + + # create the progress bar with a progress function. + pb <- progress::progress_bar$new(total = length(player_urls)) + purrr::map_dfr( + player_urls, + single_player_transfer_history + ) +} From bcfc1596333250db734abf7151464f66cad21ad2 Mon Sep 17 00:00:00 2001 From: tony Date: Thu, 23 Nov 2023 09:03:34 -0600 Subject: [PATCH 2/3] completely replace old function --- NAMESPACE | 6 + R/player_transfer_history.R | 349 ++++++++++++---------------- R/tm_player_transfer_history.R | 147 ------------ tests/testthat/test-transfermarkt.R | 68 ++++-- 4 files changed, 202 insertions(+), 368 deletions(-) delete mode 100644 R/tm_player_transfer_history.R diff --git a/NAMESPACE b/NAMESPACE index 35a608ca..bcc2bd6c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -69,12 +69,14 @@ export(understat_team_season_shots) export(understat_team_stats_breakdown) importFrom(cli,cli_alert) importFrom(dplyr,bind_cols) +importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) importFrom(dplyr,filter) importFrom(dplyr,mutate) importFrom(dplyr,pull) importFrom(dplyr,transmute) importFrom(httr,GET) +importFrom(httr,add_headers) importFrom(httr,content) importFrom(httr,set_cookies) importFrom(jsonlite,fromJSON) @@ -83,6 +85,7 @@ importFrom(lubridate,ymd) importFrom(magrittr,"%>%") importFrom(progress,progress_bar) importFrom(purrr,insistently) +importFrom(purrr,map) importFrom(purrr,map_chr) importFrom(purrr,map_dfr) importFrom(purrr,pluck) @@ -98,6 +101,8 @@ importFrom(rlang,inform) importFrom(rstudioapi,isAvailable) importFrom(rstudioapi,versionInfo) importFrom(rvest,html_attr) +importFrom(rvest,html_children) +importFrom(rvest,html_node) importFrom(rvest,html_nodes) importFrom(rvest,html_table) importFrom(rvest,html_text) @@ -106,6 +111,7 @@ importFrom(stats,runif) importFrom(stats,setNames) importFrom(stringr,str_detect) importFrom(stringr,str_replace_all) +importFrom(stringr,str_squish) importFrom(tibble,tibble) importFrom(tidyr,crossing) importFrom(utils,read.csv) diff --git a/R/player_transfer_history.R b/R/player_transfer_history.R index a5498dbf..8414a440 100644 --- a/R/player_transfer_history.R +++ b/R/player_transfer_history.R @@ -1,3 +1,15 @@ + +.standardize_fee <- function(x) { + case_when( + grepl("free", x) ~ "Free transfer", + grepl("end of", x) ~ "End of loan", + grepl("fee", x) ~ "Paid loan", + grepl("loan", x) ~ "Loan", + grepl("m|th.|k", x) ~ "Transfer", + TRUE ~ NA_character_ + ) +} + #' Get Transfermarkt player transfer history #' #' Returns data frame of player(s) transfer history from transfermarkt.com @@ -9,112 +21,128 @@ #' @return returns a dataframe of player transfers #' #' @importFrom magrittr %>% -#' @importFrom rlang .data +#' @importFrom xml2 read_html +#' @importFrom purrr map map_dfr +#' @importFrom rvest html_node html_nodes html_text html_children html_attr +#' @importFrom stringr str_squish +#' @importFrom httr GET add_headers content +#' @importFrom lubridate ymd +#' @importFrom dplyr mutate bind_rows +#' @importFrom progress progress_bar #' #' @export -#' tm_player_transfer_history <- function(player_urls, get_extra_info = TRUE) { - # .pkg_message("Extracting player transfer history data. Please acknowledge transfermarkt.com as the data source.") single_player_transfer_history <- function(player_url) { pb$tick() - main_url <- "https://www.transfermarkt.com" - page <- xml2::read_html(player_url) - player_name <- page %>% rvest::html_node("h1") %>% rvest::html_text() %>% gsub("#[[:digit:]]+ ", "", .) %>% stringr::str_squish() - - box_holder <- page %>% rvest::html_nodes(".viewport-tracking") - - # need to get the index of the box containing transfer history data - box_idx <- box_holder %>% rvest::html_attr('data-viewport') %>% grep("Transferhistorie", .) - all_transfers_player <- box_holder[box_idx] - - # now want the rows that contain data, but need to remove heading, column headers and footer - # this step is new as transfermarkt USED TO contain this data in a table element - all_transfer_rows <- all_transfers_player %>% rvest::html_children() - rem_rows_idx <- c(grep("Transfer history", all_transfer_rows %>% rvest::html_text()), - grep("Season", all_transfer_rows %>% rvest::html_text()), - grep("Total", all_transfer_rows %>% rvest::html_text())) - # only keep the necessary rows - all_transfer_rows <- all_transfer_rows[-rem_rows_idx] - - # loop through each of the rows - each_player_df <- data.frame() - - for(each_row in 1:length(all_transfer_rows)) { - season <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__season") %>% rvest::html_text() %>% stringr::str_squish(), error = function(e) NA_character_) - if(rlang::is_empty(season)) { - each_row_df <- data.frame() - } else { - transfer_date <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__date") %>% rvest::html_text() %>% stringr::str_squish() %>% - .tm_fix_dates() %>% lubridate::ymd(), error = function(e) NA_character_) - - team_from <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__old-club .tm-player-transfer-history-grid__club-link") %>% rvest::html_text() %>% - stringr::str_squish(), error = function(e) NA_character_) - team_to <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__new-club .tm-player-transfer-history-grid__club-link") %>% rvest::html_text() %>% - stringr::str_squish(), error = function(e) NA_character_) - - market_value <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__market-value") %>% rvest::html_text() %>% - stringr::str_squish() %>% - .convert_value_to_numeric(), error = function(e) NA_character_) - - transfer_value_info <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__fee") %>% rvest::html_text() %>% - stringr::str_squish() %>% tolower, error = function(e) NA_character_) - - transfer_value <- transfer_value_info %>% .convert_value_to_numeric - - if(grepl("free", transfer_value_info)) { - transfer_type <- 'Free transfer' - } else if(grepl("end of", transfer_value_info)) { - transfer_type <- 'End of loan' - } else if(grepl("fee", transfer_value_info)) { - transfer_type <- 'Paid loan' - } else if(grepl("loan", transfer_value_info)) { - transfer_type <- 'Loan' - } else if(grepl("m", transfer_value_info)) { - transfer_type <- 'Transfer' - } else if(grepl("th.", transfer_value_info)) { - transfer_type <- 'Transfer' - } else if(grepl("k", transfer_value_info)) { - transfer_type <- 'Transfer' - } else { - transfer_type <- NA_character_ - } - - # Executed if the user wants to get more info. Contains: Contract Expiring date + Days remaining. From which countries was the transfer. - if(get_extra_info == TRUE){ + player_name <- page %>% + rvest::html_node("h1") %>% + rvest::html_text() %>% + gsub("#[[:digit:]]+ ", "", .) %>% + stringr::str_squish() + + player_id <- basename(player_url) + transfer_history_resp <- httr::GET( + url = paste0("https://www.transfermarkt.com/ceapi/transferHistory/list/", player_id), + httr::add_headers(.headers = c( + `User-Agent` = getOption("worldfootballR.agent") + )) + ) + + transfer_history_content <- content(transfer_history_resp) + raw_transfer_history <- transfer_history_content[["transfers"]] + + formatted_transfer_history <- purrr::map( + raw_transfer_history, + \(.x) { + list( + "url" = .x[["url"]], + "season" = .x[["season"]], + "transfer_date" = lubridate::ymd(.x[["dateUnformatted"]]), + "team_from" = .x[["from"]][["clubName"]], + "team_to" = .x[["to"]][["clubName"]], + "market_value" = .convert_value_to_numeric(.x[["marketValue"]]), + "transfer_value" = .convert_value_to_numeric(.x[["fee"]]), + "transfer_type" = .standardize_fee(.x[["fee"]]) + ) + } + ) + + # Executed if the user wants to get more info. Contains: Contract Expiring date + Days remaining. From which countries was the transfer. + res <- if (isTRUE(get_extra_info)) { + purrr::map_dfr( + formatted_transfer_history, + \(.x) { + url <- .x$url + extra_info <- tryCatch( + xml2::read_html(paste0("https://www.transfermarkt.com", url)), + error = function(e) NA + ) + if (is.na(extra_info)) { + return(.x) + } + contract_box <- extra_info %>% + rvest::html_nodes(".large-4.columns") %>% + rvest::html_node("table") %>% + rvest::html_children() - extra_info_url <- all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__link") %>% rvest::html_attr("href") %>% paste0(main_url, .) - extra_info <- tryCatch(xml2::read_html(extra_info_url), error = function(e) NA) - contract_box <- extra_info %>% rvest::html_nodes(".large-4.columns") %>% rvest::html_node("table") %>% rvest::html_children() - contract_idx <- grep("Remaining contract duration", contract_box %>% rvest::html_text()) + contract_idx <- contract_box %>% + rvest::html_text() %>% + grep("Remaining contract duration", .) if(is.na(extra_info)) { contract_expiry <- NA days_remaining <- NA } else { - text_to_remove <- contract_box[contract_idx] %>% rvest::html_nodes("b") %>% rvest::html_text() + text_to_remove <- contract_box[contract_idx] %>% + rvest::html_nodes("b") %>% + rvest::html_text() + if(length(text_to_remove) == 0) { contract_expiry <- NA days_remaining <- NA } else { - contract_expiry <- contract_box[contract_idx] %>% rvest::html_text() %>% - gsub(text_to_remove, "", .) %>% stringr::str_squish() %>% gsub(".*\\((.*)\\).*", "\\1", .) %>% .tm_fix_dates() %>% lubridate::ymd() - days_remaining <- difftime(contract_expiry, transfer_date, units = c("days")) %>% as.numeric() + + contract_expiry <- contract_box[contract_idx] %>% + rvest::html_text() %>% + gsub(text_to_remove, "", .) %>% + stringr::str_squish() %>% + gsub(".*\\((.*)\\).*", "\\1", .) %>% + .tm_fix_dates() %>% + lubridate::ymd() + + days_remaining <- difftime( + contract_expiry, + .x$transfer_date, + units = c("days") + ) %>% + as.numeric() } } - country_to <- tryCatch(extra_info %>% rvest::html_nodes(".large-4.columns table .rechts .flaggenrahmen") %>% rvest::html_attr("title"), error = function(e) NA) + country_to <- tryCatch( + extra_info %>% + rvest::html_nodes(".large-4.columns table .rechts .flaggenrahmen") %>% + rvest::html_attr("title"), + error = function(e) NA + ) if(length(country_to) < 1){ country_to <- NA } - countries <- tryCatch(extra_info %>% rvest::html_nodes(".large-4.columns table .flaggenrahmen") %>% rvest::html_attr("title"), error = function(e) NA) - if(length(countries) < 2){ - if(is.na(country_to)){ + countries <- tryCatch( + extra_info %>% + rvest::html_nodes(".large-4.columns table .flaggenrahmen") %>% + rvest::html_attr("title"), + error = function(e) NA + ) + + if (length(countries) < 2) { + if (is.na(country_to)) { country_from <- countries[1] } else { country_from <- NA @@ -122,42 +150,53 @@ tm_player_transfer_history <- function(player_urls, get_extra_info = TRUE) { } else { country_from <- countries[1] } - } - if(get_extra_info == TRUE){ - each_row_df <- data.frame(player_name=as.character(player_name), season=as.character(season), transfer_date=lubridate::ymd(transfer_date), - country_from=as.character(country_from), team_from=as.character(team_from), country_to=as.character(country_to), - team_to=as.character(team_to), market_value=as.numeric(market_value), transfer_value=as.numeric(transfer_value), - transfer_type = as.character(transfer_type), contract_expiry=lubridate::ymd(contract_expiry), - days_remaining=as.numeric(days_remaining)) - } else { - each_row_df <- data.frame(player_name=as.character(player_name), season=as.character(season), transfer_date=lubridate::ymd(transfer_date), - team_from=as.character(team_from), team_to=as.character(team_to), market_value=as.numeric(market_value), - transfer_value=as.numeric(transfer_value), transfer_type = as.character(transfer_type)) + .x[["contract_expiry"]] <- contract_expiry + .x[["days_remaining"]] <- days_remaining + .x[["country_from"]] <- country_from + .x[["country_to"]] <- country_to + .x } - - } - - each_player_df <- rbind(each_player_df, each_row_df) + ) %>% + ## for backwards compatability + dplyr::select( + tidyselect::vars_select_helpers$all_of( + c( + "season", + "transfer_date", + "country_from", + "team_from", + "country_to", + "team_to", + "market_value", + "transfer_value", + "transfer_type", + "contract_expiry", + "days_remaining" + ) + ) + ) + } else { + dplyr::bind_rows(formatted_transfer_history) } - return(each_player_df) + res <- dplyr::mutate( + res, + "player_name" = player_name, + .before = 1 + ) + res[["url"]] <- NULL + res } # create the progress bar with a progress function. pb <- progress::progress_bar$new(total = length(player_urls)) - - all_players <- player_urls %>% purrr::map_df(single_player_transfer_history) - - return(all_players) + purrr::map_dfr( + player_urls, + single_player_transfer_history + ) } - - - - - - #' Get player transfer history #' #' Returns data frame of player(s) transfer history from transfermarkt.com @@ -174,104 +213,6 @@ tm_player_transfer_history <- function(player_urls, get_extra_info = TRUE) { player_transfer_history <- function(player_urls) { .Deprecated("tm_player_transfer_history") - # .pkg_message("Extracting player transfer history data. Please acknowledge transfermarkt.com as the data source.") - - single_player_transfer_history <- function(player_url) { - pb$tick() - - main_url <- "https://www.transfermarkt.com" - - page <- xml2::read_html(player_url) - - player_name <- page %>% rvest::html_node("h1") %>% rvest::html_text() %>% gsub("#[[:digit:]]+ ", "", .) %>% stringr::str_squish() - - box_holder <- page %>% rvest::html_nodes(".viewport-tracking") - - # need to get the index of the box containing transfer history data - box_idx <- box_holder %>% rvest::html_attr('data-viewport') %>% grep("Transferhistorie", .) - all_transfers_player <- box_holder[box_idx] - - # now want the rows that contain data, but need to remove heading, column headers and footer - # this step is new as transfermarkt USED TO contain this data in a table element - all_transfer_rows <- all_transfers_player %>% rvest::html_children() - rem_rows_idx <- c(grep("Transfer history", all_transfer_rows %>% rvest::html_text()), - grep("Season", all_transfer_rows %>% rvest::html_text()), - grep("Total", all_transfer_rows %>% rvest::html_text())) - # only keep the necessary rows - all_transfer_rows <- all_transfer_rows[-rem_rows_idx] - - # loop through each of the rows - each_player_df <- data.frame() - - for(each_row in 1:length(all_transfer_rows)) { - season <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__season") %>% rvest::html_text() %>% stringr::str_squish(), error = function(e) NA_character_) - if(rlang::is_empty(season)) { - each_row_df <- data.frame() - } else { - transfer_date <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__date") %>% rvest::html_text() %>% stringr::str_squish() %>% - .tm_fix_dates() %>% lubridate::ymd(), error = function(e) NA_character_) - - # country_flags <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".flagge"), error = function(e) NA) - country_from <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__old-club .tm-player-transfer-history-grid__flag") %>% rvest::html_attr("alt") %>% stringr::str_squish() %>% .replace_empty_na(), error = function(e) NA_character_) - country_to <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__new-club .tm-player-transfer-history-grid__flag") %>% rvest::html_attr("alt"), error = function(e) NA_character_) - # to handle for players that are retired, like: "https://www.transfermarkt.com/massimiliano-allegri/profil/spieler/163501": - if(rlang::is_empty(country_to)) { - country_to <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__new-club .tm-player-transfer-history-grid__club-link") %>% rvest::html_text() %>% stringr::str_squish(), error = function(e) NA_character_) - } - - team_from <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__old-club .tm-player-transfer-history-grid__club-link") %>% rvest::html_text() %>% - stringr::str_squish(), error = function(e) NA_character_) - team_to <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__new-club .tm-player-transfer-history-grid__club-link") %>% rvest::html_text() %>% - stringr::str_squish(), error = function(e) NA_character_) - - market_value <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__market-value") %>% rvest::html_text() %>% - stringr::str_squish() %>% - .convert_value_to_numeric(), error = function(e) NA_character_) - transfer_value <- tryCatch(all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__fee") %>% rvest::html_text() %>% - stringr::str_squish() %>% - .convert_value_to_numeric, error = function(e) NA_character_) - - # to get contract length, which isn't on the main page listing all transfers: - extra_info_url <- all_transfer_rows[each_row] %>% rvest::html_nodes(".tm-player-transfer-history-grid__link") %>% rvest::html_attr("href") %>% paste0(main_url, .) - extra_info <- tryCatch(xml2::read_html(extra_info_url), error = function(e) NA) - contract_box <- extra_info %>% rvest::html_nodes(".large-4.columns") %>% rvest::html_node("table") %>% rvest::html_children() - contract_idx <- grep("Remaining contract duration", contract_box %>% rvest::html_text()) - if(is.na(extra_info)) { - contract_expiry <- NA - days_remaining <- NA - } else { - text_to_remove <- contract_box[contract_idx] %>% rvest::html_nodes("b") %>% rvest::html_text() - if(length(text_to_remove) == 0) { - contract_expiry <- NA - days_remaining <- NA - } else { - contract_expiry <- contract_box[contract_idx] %>% rvest::html_text() %>% - gsub(text_to_remove, "", .) %>% stringr::str_squish() %>% gsub(".*\\((.*)\\).*", "\\1", .) %>% .tm_fix_dates() %>% lubridate::ymd() - days_remaining <- difftime(contract_expiry, transfer_date, units = c("days")) %>% as.numeric() - } - - } - - - each_row_df <- data.frame(player_name=as.character(player_name), season=as.character(season), transfer_date=lubridate::ymd(transfer_date), - country_from=as.character(country_from), team_from=as.character(team_from), country_to=as.character(country_to), - team_to=as.character(team_to), market_value=as.numeric(market_value), transfer_value=as.numeric(transfer_value), - contract_expiry=lubridate::ymd(contract_expiry), days_remaining=as.numeric(days_remaining)) - - } - - each_player_df <- rbind(each_player_df, each_row_df) - } - - return(each_player_df) - } - - # create the progress bar with a progress function. - pb <- progress::progress_bar$new(total = length(player_urls)) - - all_players <- player_urls %>% - purrr::map_df(single_player_transfer_history) - - return(all_players) + tm_player_transfer_history(player_urls) } diff --git a/R/tm_player_transfer_history.R b/R/tm_player_transfer_history.R deleted file mode 100644 index 5d4cb740..00000000 --- a/R/tm_player_transfer_history.R +++ /dev/null @@ -1,147 +0,0 @@ - -.standardize_fee <- function(x) { - case_when( - grepl("free", x) ~ "Free transfer", - grepl("end of", x) ~ "End of loan", - grepl("fee", x) ~ "Paid loan", - grepl("loan", x) ~ "Loan", - grepl("m|th.|k", x) ~ "Transfer", - TRUE ~ NA_character_ - ) -} - -tm_player_transfer_history2 <- function(player_urls, get_extra_info = TRUE) { - - player_url <- "https://www.transfermarkt.com/jack-rodwell/profil/spieler/57079" - get_extra_info <- TRUE - - single_player_transfer_history <- function(player_url) { - pb$tick() - - main_url <- "https://www.transfermarkt.us.com" - - page <- xml2::read_html(player_url) - - player_name <- page %>% - rvest::html_node("h1") %>% - rvest::html_text() %>% - gsub("#[[:digit:]]+ ", "", .) %>% - stringr::str_squish() - - player_id <- basename(player_url) - transfer_history_resp <- httr::GET( - url = paste0("https://www.transfermarkt.com/ceapi/transferHistory/list/", player_id), - httr::add_headers(.headers = c( - `User-Agent` = getOption("worldfootballR.agent") - )) - ) - - transfer_history_content <- content(transfer_history_resp) - raw_transfer_history <- transfer_history_content[["transfers"]] - - formatted_transfer_history <- purrr::map( - raw_transfer_history, - \(.x) { - list( - "url" = .x[["url"]], - "season" = .x[["season"]], - "transfer_date" = lubridate::ymd(.x[["dateUnformatted"]]), - "team_from" = .x[["from"]][["clubName"]], - "team_to" = .x[["to"]][["clubName"]], - "market_value" = .convert_value_to_numeric(.x[["marketValue"]]), - "fee" = .standardize_fee(.x[["fee"]]) - ) - } - ) - - # Executed if the user wants to get more info. Contains: Contract Expiring date + Days remaining. From which countries was the transfer. - if (isTRUE(get_extra_info)) { - res <- purrr::map( - formatted_transfer_history, - \(.x) { - url <- .x$url - extra_info <- tryCatch(xml2::read_html(url), error = function(e) NA) - contract_box <- extra_info %>% - rvest::html_nodes(".large-4.columns") %>% - rvest::html_node("table") %>% - rvest::html_children() - - contract_idx <- contract_box %>% - rvest::html_text() %>% - grep("Remaining contract duration", .) - - if(is.na(extra_info)) { - contract_expiry <- NA - days_remaining <- NA - } else { - text_to_remove <- contract_box[contract_idx] %>% rvest::html_nodes("b") %>% rvest::html_text() - if(length(text_to_remove) == 0) { - contract_expiry <- NA - days_remaining <- NA - } else { - contract_expiry <- contract_box[contract_idx] %>% - rvest::html_text() %>% - gsub(text_to_remove, "", .) %>% - stringr::str_squish() %>% - gsub(".*\\((.*)\\).*", "\\1", .) %>% - .tm_fix_dates() %>% - lubridate::ymd() - days_remaining <- difftime(contract_expiry, transfer_date, units = c("days")) %>% as.numeric() - } - } - - country_to <- tryCatch( - extra_info %>% - rvest::html_nodes(".large-4.columns table .rechts .flaggenrahmen") %>% - rvest::html_attr("title"), - error = function(e) NA - ) - - if(length(country_to) < 1){ - country_to <- NA - } - - countries <- tryCatch( - extra_info %>% - rvest::html_nodes(".large-4.columns table .flaggenrahmen") %>% - rvest::html_attr("title"), - error = function(e) NA - ) - - if (length(countries) < 2) { - if (is.na(country_to)) { - country_from <- countries[1] - } else { - country_from <- NA - } - } else { - country_from <- countries[1] - } - - .x[["contract_expiry"]] <- contract_expiry - .x[["days_remaining"]] <- days_remaining - .x[["country_from"]] <- country_from - .x[["country_to"]] <- country_to - .x - } - ) - } else { - res <- dplyr::bind_rows(formatted_transfer_history) - } - - res <- dplyr::mutate( - res, - "player_name" = player_name, - .before = 1 - ) - res[["url"]] <- NULL - res - } - - # create the progress bar with a progress function. - pb <- progress::progress_bar$new(total = length(player_urls)) - purrr::map_dfr( - player_urls, - single_player_transfer_history - ) -} diff --git a/tests/testthat/test-transfermarkt.R b/tests/testthat/test-transfermarkt.R index 88f9849e..9e2c7160 100644 --- a/tests/testthat/test-transfermarkt.R +++ b/tests/testthat/test-transfermarkt.R @@ -11,24 +11,58 @@ test_that("tm_player_market_values() works", { }) +test_that("tm_player_transfer_history() works", { + testthat::skip_if_offline() + testthat::skip_on_cran() + + ## multiple URLs + transfer_data <- tm_player_transfer_history( + c( + "https://www.transfermarkt.com/cristiano-ronaldo/profil/spieler/8198", + "https://www.transfermarkt.com/jack-rodwell/profil/spieler/57079" + ) + ) + + expected_base_transfer_data_cols <- c( + "player_name", + "season", + "transfer_date", + "team_from", + "team_to", + "market_value", + "transfer_value", + "transfer_type" + ) + expected_extra_transfer_data_cols <- c( + "country_from", + "country_to", + "contract_expiry", + "days_remaining" + ) + + # test the functions returns the data + expect_type(transfer_data, "list") + expect_true(nrow(transfer_data) > 0) + expect_setequal( + colnames(transfer_data), c(expected_base_transfer_data_cols, expected_extra_transfer_data_cols) + ) + expect_true(length(unique(transfer_data$player_name)) == 2L) + + ## non default params + transfer_data_no_extra_info <- tm_player_transfer_history( + "https://www.transfermarkt.com/jack-rodwell/profil/spieler/57079", + get_extra_info = FALSE + ) + + expect_true(nrow(transfer_data_no_extra_info) > 0) + expect_setequal( + colnames(transfer_data_no_extra_info), expected_base_transfer_data_cols + ) -##-------------------------------------------------------------------------------------- -# As at version 0.6.4.0012, commenting out this test as the data is no longer available -# by static scraping and will need to incorporate some form of browser automation -##-------------------------------------------------------------------------------------- -# test_that("tm_player_transfer_history() works", { -# testthat::skip_if_offline() -# testthat::skip_on_cran() -# -# transfer_data <- tm_player_transfer_history(c("https://www.transfermarkt.com/cristiano-ronaldo/profil/spieler/8198")) -# # test the functions returns the data -# expect_type(transfer_data, "list") -# expect_true(ncol(transfer_data) != 0) -# -# # test that an invalid country will error -# expect_error(tm_player_transfer_history("aaa.com.au")) -# -# }) + # test that an invalid country will error + expect_error(tm_player_transfer_history("aaa.com.au")) + +}) test_that("tm_team_transfers() works", { From f8c7ab5d20cc52bd23734519f41fd8f75e33c1a3 Mon Sep 17 00:00:00 2001 From: tony Date: Thu, 23 Nov 2023 09:15:43 -0600 Subject: [PATCH 3/3] update news --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ NEWS.md | 2 +- R/player_transfer_history.R | 3 ++- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index adca2e50..8d8789db 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: worldfootballR Title: Extract and Clean World Football (Soccer) Data -Version: 0.6.4.0012 +Version: 0.6.4.0013 Authors@R: c( person("Jason", "Zivkovic", , "jaseziv83@gmail.com", role = c("aut", "cre", "cph")), person("Tony", "ElHabr", , "anthonyelhabr@gmail.com", role = "ctb"), diff --git a/NAMESPACE b/NAMESPACE index bcc2bd6c..ee9afc39 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,6 +74,7 @@ importFrom(dplyr,case_when) importFrom(dplyr,filter) importFrom(dplyr,mutate) importFrom(dplyr,pull) +importFrom(dplyr,select) importFrom(dplyr,transmute) importFrom(httr,GET) importFrom(httr,add_headers) @@ -114,6 +115,7 @@ importFrom(stringr,str_replace_all) importFrom(stringr,str_squish) importFrom(tibble,tibble) importFrom(tidyr,crossing) +importFrom(tidyselect,vars_select_helpers) importFrom(utils,read.csv) importFrom(utils,sessionInfo) importFrom(xml2,read_html) diff --git a/NEWS.md b/NEWS.md index 787d82bb..936cc67e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,7 +9,7 @@ * `fb_league_stats(team_or_player = "player")` returning wrong season's data for Australian league (0.6.4.0009) [#333](https://github.com/JaseZiv/worldfootballR/issues/333) * `tm_player_market_values()` returing the player name and valuation on separate rows in the `player_name` column [#338] (https://github.com/JaseZiv/worldfootballR/issues/338) and also returning `NA`s for the `player_age` column (0.6.4.0010) [#336](https://github.com/JaseZiv/worldfootballR/issues/336) * `fb_match_results()` returns `NA` goals due to inconsistent `iconv()` behvaior on different systems (0.6.4.0011) [#326](https://github.com/JaseZiv/worldfootballR/issues/326) -* `tm_team_player_urls()` html element changed on transfermarkt so stopped returning values (0.6.4.0012) +* `tm_team_player_urls()` fixed after change to server-side loading (0.6.4.0013) [#342](https://github.com/JaseZiv/worldfootballR/issues/342) ### Improvements diff --git a/R/player_transfer_history.R b/R/player_transfer_history.R index 8414a440..90d4c18c 100644 --- a/R/player_transfer_history.R +++ b/R/player_transfer_history.R @@ -27,8 +27,9 @@ #' @importFrom stringr str_squish #' @importFrom httr GET add_headers content #' @importFrom lubridate ymd -#' @importFrom dplyr mutate bind_rows +#' @importFrom dplyr mutate bind_rows select #' @importFrom progress progress_bar +#' @importFrom tidyselect vars_select_helpers #' #' @export tm_player_transfer_history <- function(player_urls, get_extra_info = TRUE) {