-
Notifications
You must be signed in to change notification settings - Fork 62
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Generalize player table scraping in fb_league_stats()
#359
Changes from all commits
912875a
35888ec
87f0f47
23aa57c
b65ea84
3a5113a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
Type: Package | ||
Package: worldfootballR | ||
Title: Extract and Clean World Football (Soccer) Data | ||
Version: 0.6.5.0002 | ||
Version: 0.6.5.0003 | ||
Authors@R: c( | ||
person("Jason", "Zivkovic", , "[email protected]", role = c("aut", "cre", "cph")), | ||
person("Tony", "ElHabr", , "[email protected]", role = "ctb"), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,34 +36,33 @@ | |
.frequency = "once", | ||
.frequency_id = "fb_league_stats-player" | ||
) | ||
|
||
session <- worldfootballr_chromote_session(url) | ||
page <- worldfootballr_html_page(session) | ||
player_table <- worldfootballr_html_player_table(session) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the new solution is a little more specific. this function only returns the HTML for 1 table, while the prior solution returned the HTML for all (3) tables on the page, from which the player table was later plucked. arguably this new solution is better |
||
session$session$close(wait_ = FALSE) | ||
elements <- xml2::xml_children(xml2::xml_children(page)) | ||
tables <- rvest::html_table(elements) | ||
|
||
n_tables <- length(tables) | ||
if (n_tables != 3) { | ||
warning(sprintf("Did not find the expected number of tables on the page (3). Found %s.", n_tables)) | ||
if (is.null(player_table)) { | ||
return(tibble::tibble()) | ||
} | ||
renamed_table <- .rename_fb_cols(tables[[3]]) | ||
renamed_table <- renamed_table[renamed_table$Rk != "Rk", ] | ||
renamed_table <- .add_player_href( | ||
renamed_table, | ||
parent_element = elements[[3]], | ||
|
||
player_table_elements <- xml2::xml_children(xml2::xml_children(player_table)) | ||
parsed_player_table <- rvest::html_table(player_table_elements) | ||
renamed_player_table <- .rename_fb_cols(parsed_player_table[[1]]) | ||
renamed_player_table <- renamed_player_table[renamed_player_table$Rk != "Rk", ] | ||
renamed_player_table <- .add_player_href( | ||
renamed_player_table, | ||
parent_element = player_table_elements, | ||
player_xpath = ".//tbody/tr/td[@data-stat='player']/a" | ||
) | ||
} | ||
|
||
suppressMessages( | ||
readr::type_convert( | ||
clean_table, | ||
guess_integer = TRUE, | ||
na = "", | ||
trim_ws = TRUE | ||
suppressMessages( | ||
readr::type_convert( | ||
renamed_player_table, | ||
guess_integer = TRUE, | ||
na = "", | ||
trim_ws = TRUE | ||
) | ||
Comment on lines
+48
to
+63
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there's basically no logic change here. i've just added |
||
) | ||
) | ||
} | ||
} | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -406,16 +406,16 @@ test_that("fb_league_stats() for players works", { | |
testthat::skip_on_cran() | ||
testthat::skip_on_ci() | ||
expected_player_shooting_cols <- c("Rk", "Player", "Player_Href", "Nation", "Pos", "Squad", "Age", "Born", "Mins_Per_90", "Gls_Standard", "Sh_Standard", "SoT_Standard", "SoT_percent_Standard", "Sh_per_90_Standard", "SoT_per_90_Standard", "G_per_Sh_Standard", "G_per_SoT_Standard", "Dist_Standard", "FK_Standard", "PK_Standard", "PKatt_Standard", "xG_Expected", "npxG_Expected", "npxG_per_Sh_Expected", "G_minus_xG_Expected", "np:G_minus_xG_Expected", "Matches", "url") | ||
epl_player_shooting_22 <- fb_league_stats( | ||
single_player_shooting_22 <- fb_league_stats( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. realized this variable has |
||
country = "BRA", | ||
gender = "M", | ||
season_end_year = 2022, | ||
tier = "1st", | ||
stat_type = "shooting", | ||
team_or_player = "player" | ||
) | ||
expect_gt(nrow(epl_player_shooting_22), 0) | ||
expect_setequal(colnames(epl_player_shooting_22), expected_player_shooting_cols) | ||
expect_gt(nrow(single_player_shooting_22), 0) | ||
expect_setequal(colnames(single_player_shooting_22), expected_player_shooting_cols) | ||
|
||
expected_player_misc_cols <- c("Rk", "Player", "Player_Href", "Nation", "Pos", "Squad", "Age", "Born", "Mins_Per_90", "CrdY", "CrdR", "2CrdY", "Fls", "Fld", "Off", "Crs", "Int", "TklW", "PKwon", "PKcon", "OG", "Recov", "Won_Aerial Duels", "Lost_Aerial Duels", "Won_percent_Aerial Duels", "Matches", "url") | ||
## testing a lot would take too long, so just test multiple years since that is the most likely input param to have multiple values | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note that i couldn't figure out how to identify the element with the commented out table directly, so i've opted to do it "indirectly", by identifying the node above it (which always has the same CSS class).