Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalize player table scraping in fb_league_stats() #359

Merged
merged 6 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: worldfootballR
Title: Extract and Clean World Football (Soccer) Data
Version: 0.6.5.0002
Version: 0.6.5.0003
Authors@R: c(
person("Jason", "Zivkovic", , "[email protected]", role = c("aut", "cre", "cph")),
person("Tony", "ElHabr", , "[email protected]", role = "ctb"),
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
* `fb_league_stats()` not returning `opponent` table. (0.6.5.0001) [#355](https://github.com/JaseZiv/worldfootballR/issues/355)
* `tm_player_bio()` not returning values in the `player_valuation`, `max_player_valuation` and `max_player_valuation_date` fields. Unfortunately, `max_player_valuation` and `max_player_valuation_date` fields are no able to be scraped at this release (0.6.5.0002) [#357](https://github.com/JaseZiv/worldfootballR/issues/357)

* `fb_league_stats()` not returning `player` table when hidden on page load. (0.6.5.0003) [#351](https://github.com/JaseZiv/worldfootballR/issues/351)

***

# worldfootballR 0.6.5
Expand Down
29 changes: 18 additions & 11 deletions R/chromote-fbref.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ WorldfootballRDynamicPage <- R6::R6Class("WorldfootballRDynamicPage", public = l
unlist(self$session$DOM$querySelectorAll(self$root_id, css)$nodeIds)
},

call_node_method = function(node_id) {
js_fun <- paste0("function() { return this.outerHTML}")
call_node_method = function(node_id, method, ...) {
js_fun <- paste0("function() { return this", method, "}")
obj_id <- self$object_id(node_id)
self$session$Runtime$callFunctionOn(js_fun, objectId = obj_id)
self$session$Runtime$callFunctionOn(js_fun, objectId = obj_id, ...)
},

object_id = function(node_id) {
Expand All @@ -41,14 +41,21 @@ WorldfootballRDynamicPage <- R6::R6Class("WorldfootballRDynamicPage", public = l
#' @importFrom purrr map_chr
#' @importFrom xml2 xml_children read_html
#' @noRd
worldfootballr_html_page <- function(x) {
stopifnot(identical(class(x), c("WorldfootballRDynamicPage", "R6")))
nodes <- x$find_nodes("table")

elements <- purrr::map_chr(nodes, function(node_id) {
json <- x$call_node_method(node_id)
json$result$value
})
worldfootballr_html_player_table <- function(session) {
stopifnot(identical(class(session), c("WorldfootballRDynamicPage", "R6")))

## find element "above" commented out table
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note that i couldn't figure out how to identify the element with the commented out table directly, so i've opted to do it "indirectly", by identifying the node above it (which always has the same CSS class).

node_id0 <- session$find_nodes("#stats_shooting_sh")
## skip 1 for the div "placeholder"
node_id <- node_id0 + 2L

elements <- session$call_node_method(node_id, ".textContent")[["result"]][["value"]]
n_elements <- length(elements)
if (n_elements != 1) {
warning(sprintf("Did not find the expected number of tables on the page (3). Found %s.", n_elements))
return(NULL)
}

html <- paste0("<html>", paste0(elements, collapse = "\n"), "</html>")
xml2::read_html(html)
}
39 changes: 19 additions & 20 deletions R/fb_league_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,34 +36,33 @@
.frequency = "once",
.frequency_id = "fb_league_stats-player"
)

session <- worldfootballr_chromote_session(url)
page <- worldfootballr_html_page(session)
player_table <- worldfootballr_html_player_table(session)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the new solution is a little more specific. this function only returns the HTML for 1 table, while the prior solution returned the HTML for all (3) tables on the page, from which the player table was later plucked. arguably this new solution is better

session$session$close(wait_ = FALSE)
elements <- xml2::xml_children(xml2::xml_children(page))
tables <- rvest::html_table(elements)

n_tables <- length(tables)
if (n_tables != 3) {
warning(sprintf("Did not find the expected number of tables on the page (3). Found %s.", n_tables))
if (is.null(player_table)) {
return(tibble::tibble())
}
renamed_table <- .rename_fb_cols(tables[[3]])
renamed_table <- renamed_table[renamed_table$Rk != "Rk", ]
renamed_table <- .add_player_href(
renamed_table,
parent_element = elements[[3]],

player_table_elements <- xml2::xml_children(xml2::xml_children(player_table))
parsed_player_table <- rvest::html_table(player_table_elements)
renamed_player_table <- .rename_fb_cols(parsed_player_table[[1]])
renamed_player_table <- renamed_player_table[renamed_player_table$Rk != "Rk", ]
renamed_player_table <- .add_player_href(
renamed_player_table,
parent_element = player_table_elements,
player_xpath = ".//tbody/tr/td[@data-stat='player']/a"
)
}

suppressMessages(
readr::type_convert(
clean_table,
guess_integer = TRUE,
na = "",
trim_ws = TRUE
suppressMessages(
readr::type_convert(
renamed_player_table,
guess_integer = TRUE,
na = "",
trim_ws = TRUE
)
Comment on lines +48 to +63
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there's basically no logic change here. i've just added _player to the variable names

)
)
}
}


Expand Down
6 changes: 3 additions & 3 deletions tests/testthat/test-fbref.R
Original file line number Diff line number Diff line change
Expand Up @@ -406,16 +406,16 @@ test_that("fb_league_stats() for players works", {
testthat::skip_on_cran()
testthat::skip_on_ci()
expected_player_shooting_cols <- c("Rk", "Player", "Player_Href", "Nation", "Pos", "Squad", "Age", "Born", "Mins_Per_90", "Gls_Standard", "Sh_Standard", "SoT_Standard", "SoT_percent_Standard", "Sh_per_90_Standard", "SoT_per_90_Standard", "G_per_Sh_Standard", "G_per_SoT_Standard", "Dist_Standard", "FK_Standard", "PK_Standard", "PKatt_Standard", "xG_Expected", "npxG_Expected", "npxG_per_Sh_Expected", "G_minus_xG_Expected", "np:G_minus_xG_Expected", "Matches", "url")
epl_player_shooting_22 <- fb_league_stats(
single_player_shooting_22 <- fb_league_stats(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

realized this variable has epl_ in it but we're scraping for Brazil. i've renamed it to single_ to implicitly reflect its usage for testing just 1 league

country = "BRA",
gender = "M",
season_end_year = 2022,
tier = "1st",
stat_type = "shooting",
team_or_player = "player"
)
expect_gt(nrow(epl_player_shooting_22), 0)
expect_setequal(colnames(epl_player_shooting_22), expected_player_shooting_cols)
expect_gt(nrow(single_player_shooting_22), 0)
expect_setequal(colnames(single_player_shooting_22), expected_player_shooting_cols)

expected_player_misc_cols <- c("Rk", "Player", "Player_Href", "Nation", "Pos", "Squad", "Age", "Born", "Mins_Per_90", "CrdY", "CrdR", "2CrdY", "Fls", "Fld", "Off", "Crs", "Int", "TklW", "PKwon", "PKcon", "OG", "Recov", "Won_Aerial Duels", "Lost_Aerial Duels", "Won_percent_Aerial Duels", "Matches", "url")
## testing a lot would take too long, so just test multiple years since that is the most likely input param to have multiple values
Expand Down
Loading