diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 2a3b6db..0f2fe08 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -22,14 +22,9 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - # use 4.1 to check with rtools40's older compiler - - {os: windows-latest, r: '4.1'} - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - {os: ubuntu-latest, r: 'oldrel-1'} - - {os: ubuntu-latest, r: 'oldrel-2'} - - {os: ubuntu-latest, r: 'oldrel-3'} - - {os: ubuntu-latest, r: 'oldrel-4'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} diff --git a/DESCRIPTION b/DESCRIPTION index c2763d2..20a2d64 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,8 +3,9 @@ Package: tidytuesdayR Title: Access the Weekly 'TidyTuesday' Project Dataset Version: 1.0.3.9000 Authors@R: c( - person("Ellis", "Hughes", , "ellishughes@live.com", role = c("aut", "cre")), - person("Jon", "Harmon", , "jonthegeek@gmail.com", role = "ctb"), + person("Jon", "Harmon", , "jonthegeek@gmail.com", role = c("aut", "cre"), + comment = c(ORCID = "0000-0003-4781-4346")), + person("Ellis", "Hughes", , "ellishughes@live.com", role = "aut"), person("Thomas", "Mock", , "j.thomasmock@gmail.com", role = "ctb"), person("Data Science Learning Community", , , "tidytuesday@dslc.io", role = "dtc") ) @@ -20,25 +21,29 @@ BugReports: https://github.com/thebioengineer/tidytuesdayR/issues Depends: R (>= 3.4.0) Imports: - httr, + cli, + gh, + glue, jsonlite, lubridate (>= 1.7.0), magrittr, - purrr (>= 0.2.5), + purrr (>= 1.0.0), readr (>= 1.0.0), - readxl (>= 1.0.0), rlang, - rstudioapi (>= 0.2), rvest (>= 0.3.2), + tidyr, tools (>= 3.1.0), usethis, xml2 (>= 1.2.0) Suggests: covr, pkgdown, + readxl (>= 1.0.0), + rstudioapi (>= 0.2), testthat (>= 3.0.0), tibble, withr +Config/testthat/edition: 3 Encoding: UTF-8 +Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 -Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index d624406..101ed79 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,12 +4,8 @@ S3method(print,tt) S3method(print,tt_data) S3method(print,tt_dataset_table) S3method(print,tt_dataset_table_list) -S3method(tt_gh_error,response) -S3method(tt_gh_error,tt_response) export("%>%") -export(github_pat) export(last_tuesday) -export(rate_limit_check) export(readme) export(tt_available) export(tt_datasets) @@ -18,40 +14,12 @@ export(tt_download_file) export(tt_load) export(tt_load_gh) export(use_tidytemplate) -importFrom(httr,GET) -importFrom(httr,add_headers) -importFrom(jsonlite,base64_dec) -importFrom(jsonlite,base64_enc) -importFrom(jsonlite,parse_json) -importFrom(lubridate,as_date) -importFrom(lubridate,day) -importFrom(lubridate,is.Date) -importFrom(lubridate,month) +importFrom(glue,glue) importFrom(lubridate,today) importFrom(lubridate,wday) -importFrom(lubridate,year) -importFrom(lubridate,ymd) importFrom(magrittr,"%>%") -importFrom(purrr,map) -importFrom(purrr,walk) -importFrom(readr,read_delim) -importFrom(readxl,read_xls) -importFrom(readxl,read_xlsx) -importFrom(rlang,"%||%") -importFrom(rstudioapi,isAvailable) -importFrom(rstudioapi,viewer) -importFrom(rvest,html_node) -importFrom(rvest,html_nodes) -importFrom(rvest,html_table) importFrom(stats,aggregate) -importFrom(stats,na.omit) importFrom(stats,na.pass) importFrom(stats,setNames) -importFrom(tools,file_ext) importFrom(tools,file_path_sans_ext) importFrom(usethis,use_template) -importFrom(utils,URLencode) -importFrom(utils,browseURL) 
-importFrom(utils,read.csv) -importFrom(xml2,read_html) -importFrom(xml2,write_html) diff --git a/NEWS.md b/NEWS.md index 24cccc3..5ee9b05 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,16 @@ -# tidytuesdayR (1.0.3 version) -* [bug fix] Address case where when trying to test +# tidytuesdayR (development version) -# tidytuesdayR (1.0.2 version) +* [maintenance] tidytuesdayR now uses the {gh} package to manage all interactions with the GitHub API. This should make the package more stable and easier to maintain. (@jonthegeek, #78) + + +# tidytuesdayR 1.0.3 + +* [bug fix] Address case where rate limit hit when trying to test + +# tidytuesdayR 1.0.2 * [bug fix] During testing it was identified that 502 errors from github servers would cause the code to error out. Now it will retry a few times before giving an error. -* [bug fix] No internet connection bug on rstudio resolved to due malformed url checks (https). +* [bug fix] No internet connection bug on rstudio resolved due to malformed url checks (https). * [bug fix] Partial argument matching correction in `tt_download_file.character()`, `tt_parse_blob()`, and in tests. (thanks @mgirlich) # tidytuesdayR 1.0.1 diff --git a/R/aaa-shared.R b/R/aaa-shared.R new file mode 100644 index 0000000..331ebbd --- /dev/null +++ b/R/aaa-shared.R @@ -0,0 +1,17 @@ +#' Parameters used in multiple functions +#' +#' Reused parameter definitions are gathered here for easier editing. +#' +#' @param auth A GitHub token. See [gh::gh_token()] for more details. +#' @param tt A `tt` object, output from [tt_load_gh()]. +#' @param files Which file names to download. Default "All" downloads all files +#' for the specified week. +#' @param week Which week number to use within a given year. Only used when `x` +#' is a valid year. +#' @param x The date of data to pull (in "YYYY-MM-dd" format), or the four-digit +#' year as a number. +#' @param year What year of TidyTuesday to use +#' +#' @name shared-params +#' @keywords internal +NULL diff --git a/R/github_api.R b/R/github_api.R index 8a578e5..b775e4f 100644 --- a/R/github_api.R +++ b/R/github_api.R @@ -1,541 +1,118 @@ -#' Read Contents from GitHub -#' -#' Provide tool to read raw data and return as text the raw data using the -#' github api -#' -#' @param path Relative path from within the TidyTuesday Repository -#' @param auth github PAT -#' -#' @return raw text of the content with the sha as an attribute -#' @noRd -#' @examplesIf interactive() -#' text_csv <- github_contents("data/2020/2020-04-07/tdf_stages.csv") -#' tour_de_france_stages <- readr::read_csv(text_csv) -github_contents <- function(path, auth = github_pat()) { - base_url <- file.path( - "https://api.github.com/repos", - options("tidytuesdayR.tt_repo"), - "contents", - path - ) - - github_blob(path, auth = auth) -} - -#' Read Contents from GitHub as html -#' -#' Provide tools to read and process readme's as html using the github api -#' -#' @param path Relative path from within the TidyTuesday Repository to contents -#' that can be returned as HTML -#' @param ... 
optional arguments to pass to \code{read_html} -#' @param auth github PAT -#' -#' @return result of read_html on the contents -#' @noRd -#' -#' @examplesIf interactive() -#' main_readme <- github_html("README.md") -#' week_readme <- github_html("data/2020/2020-01-07/readme.md") -#' -#' @importFrom xml2 read_html -github_html <- function(path, - ..., - auth = github_pat()) { - base_url <- file.path( - "https://api.github.com/repos", - options("tidytuesdayR.tt_repo"), - "contents", - path - ) - - url_response <- github_GET( - base_url, - auth = auth, - Accept = "application/vnd.github.v3.html" - ) - - if (url_response$status_code == 200) { - github_page(read_html(x = url_response$content, ...)) - } else { - stop(tt_gh_error(url_response)$message) - } - } - - -#' Read Contents from GitHub as html -#' -#' provide tools to read and process readme's as html using the github api -#' -#' @param dirpath Relative path from within the TidyTuesday Repository to -#' folder of contents wanting sha for -#' @param branch which branch to get sha for. assumed to be -#' master (and usually should be) -#' @param auth github PAT. See PAT section for more information -#' -#' @return result data.frame of SHA and other information of directory contents -#' @noRd -#' -#' @examplesIf interactive() -#' sha <- github_sha("data/2020/2020-01-07") -#' -#' @importFrom xml2 read_html -#' @importFrom utils URLencode -github_sha <- function(dirpath, - branch = "master", - auth = github_pat()) { - if (dirpath == ".") { - dirpath <- "" - } - - base_url <- file.path( - "https://api.github.com/repos", - options("tidytuesdayR.tt_repo"), - "git/trees", - URLencode(paste(branch, dirpath, sep = ":")) - ) +# Hit the api ---- - url_response <- github_GET(base_url, auth = auth) - - if (url_response$status_code == 200) { - url_json <- GET_json(url_response) - do.call( - "rbind", - lapply( - url_json$tree, - function(x) { - data.frame( - x[c("path", "sha")], - stringsAsFactors = FALSE - ) - } - ) - ) - } else { - stop(tt_gh_error(url_response)$message) - } - } +# This one is purely a wrapper around gh::gh(), so at least for now we'll trust +# that it works. +# +# nocov start -#' Read blob Contents from GitHub +#' Get data from the tt github repo. #' -#' provide tools to read and process blob's using the github api +#' @param path Path within the rfordatascience/tidytuesday repo. +#' @param ... Additional parameters passed to [gh::gh()]. #' -#' @param path Relative path from within the TidyTuesday Repository to contents, -#' usually because it was too large to be read with the contencts api. 
-#' @param as_raw optional arguments to pass to \code{read_html} -#' @param sha sha of object if known in liu of path (usually best to give both -#' for clarity) -#' @param auth github PAT -#' -#' @return a raw/character object based on the blob -#' @noRd -#' -#' @examplesIf interactive() -#' main_readme_blob <- github_blob("README.md", as_raw = TRUE) -github_blob <- function(path, as_raw = FALSE, sha = NULL, auth = github_pat()) { - if (is.null(sha)) { - dir_sha <- github_sha(dirname(path)) - sha <- dir_sha$sha[dir_sha$path == basename(path)] - if (identical(sha, character(0))) { - stop("Response Code 404: Not Found") - } - } - - base_url <- file.path( - "https://api.github.com/repos", - options("tidytuesdayR.tt_repo"), - "git/blobs", - sha - ) - - url_response <- github_GET( - base_url, - auth = auth, - Accept = "application/vnd.github.VERSION.raw" - ) - - if (url_response$status_code == 200) { - if (as_raw == TRUE) { - content <- url_response$content - } else { - content <- rawToChar(url_response$content) - } - attr(content, ".sha") <- sha - return(content) - } else { - stop(tt_gh_error(url_response)$message) - } - } - - -#' read json base64 contents from github -#' -#' provide tool to read and process data using the github api -#' @param b64 base64 character value to be decoded and converted to -#' character value -#' @importFrom jsonlite base64_dec -#' -#' @return a character vector of the input decoded from base64 -#' @noRd -#' -#' @examples -#' # Returns the value "Hello World" -#' base_64_to_char("SGVsbG8gV29ybGQ=") -base_64_to_char <- function(b64) { - rawToChar(base64_dec(b64)) -} - -#' read GET json contents to char -#' -#' provide tool to read and process data using the github api from GET command -#' @param get_response object of class "response" from GET command. returns -#' JSON value. -#' -#' @return a list object if the content json -#' @noRd -#' -#' @importFrom jsonlite parse_json -GET_json <- function(get_response) { - jsonlite::parse_json(rawToChar(get_response$content)) -} - - -#' Create shell for HTML content from github -#' -#' Provide the necessary
section to wrap around raw html content read -#' from github. -#' -#' @param page_content html content in xml_document class -#' -#' @return xml_document with github header -#' @noRd -#' -#' @importFrom xml2 read_html -#' @importFrom rvest html_nodes -github_page <- function(page_content) { - header <- paste0( - "" +#' @return The GitHub response as parsed by [gh::gh()]. +#' @keywords internal +gh_get <- function(path, auth = gh::gh_token(), ...) { + gh::gh( + "/repos/:tt_repo/contents/:path", + path = path, + tt_repo = getOption("tidytuesdayR.tt_repo"), + .token = auth, + ... ) - - body <- page_content %>% - html_nodes("body") %>% - as.character() %>% - enc2native() - - read_html(paste0(header, body)) } +# nocov end -#' Return the local user's GitHub Personal Access Token -#' -#' Extract the GitHub Personal Access Token (PAT) from the system environment -#' for authenticated requests. -#' -#' @section PAT: -#' -#' A Github 'PAT' is a Personal Access Token. This allows for signed queries -#' to the github api, and increases the limit on the number of requests -#' allowed from 60 to 5000. Follow instructions from -#'Archive of datasets and articles from the 2018 series of #TidyTuesday
events.
Data comes from The Economist GitHub. The following information was taken directly from their GitHub readme.
These are the data behind the "space launches" article, The space race is dominated by new contenders.

Principal data came from Jonathan McDowell's JSR Launch Vehicle Database, available online at http://www.planet4589.org/space/lvdb/index.html.
| File | Contents | Source |
|---|---|---|
| agencies | Space launch providers | Jonathan McDowell; The Economist |
| launches | Individual space launches | Jonathan McDowell; The Economist |
| variable | definition |
|---|---|
| tag | Harvard or COSPAR id of launch |
| JD | Julian Date of launch |
| launch_date | date of launch |
| launch_year | year of launch |
| type | type of launch vehicle |
| variant | variant of launch vehicle |
| mission | |
| agency | launching agency |
| state_code | launching agency's state |
| category | success (O) or failure (F) |
| agency_type | type of agency |
| variable | definition |
|---|---|
| agency | org phase code |
| count | number of launches |
| ucode | org Ucode |
| state_code | responsible state |
| type | type of org |
| class | class of org |
| tstart | org/phase founding date |
| tstop | org/phase ending date |
| short_name | short name |
| name | full name |
| location | plain english location |
| longitude | |
| latitude | |
| error | uncertainty in long/lat |
| parent | parent org |
| short_english_name | english short name |
| english_name | english full name |
| unicode_name | unicode full name |
| agency_type | type of agency |
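If it helps to see these dictionaries in use, here is a minimal sketch that reads both files and summarises launches by agency type. The file paths are an assumption based on the repo's usual data/YYYY/YYYY-MM-DD/ layout; check the week's folder for the actual locations.

```r
library(tidyverse)

# Assumed locations in the rfordatascience/tidytuesday repo
agencies <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv")
launches <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv")

# Launches per year and agency type, with the share that succeeded
# (category is "O" for success, "F" for failure)
launches %>%
  group_by(launch_year, agency_type) %>%
  summarise(
    n = n(),
    success_rate = mean(category == "O"),
    .groups = "drop"
  )
```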
"Seattle Department of Transportation has 12 bike counters (four of which also count pedestrians) located on neighborhood greenways, multi-use trails, at the Fremont Bridge and on SW Spokane Street. The counters are helping us create a ridership baseline in 2014 that can be used to assess future years and make sure our investments are helping us to reach our goal of quadrupling ridership by 2030. Read our Bicycle Master Plan to learn more about what Seattle is doing to create a citywide bicycle network."
The Seattle Times recently covered What we can learn from Seattle's bike-counter data. They have some elegant data visualizations there!

```r
bike_traffic <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-02/bike_traffic.csv")
```
| variable | class | description |
|---|---|---|
| date | date (mdy hms am/pm) | Date of data upload |
| crossing | character | The street crossing/intersection |
| direction | character | North/South/East/West - varies by crossing |
| bike_count | double | Number of bikes counted for each hour window |
| ped_count | double | Number of pedestrians counted for each hour window |
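As a minimal sketch of how these columns might be used (assuming the read_csv() call above has been run):

```r
library(tidyverse)

# Total bike counts by crossing and direction, ignoring missing hours
bike_traffic %>%
  group_by(crossing, direction) %>%
  summarise(total_bikes = sum(bike_count, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_bikes))
```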
A few exciting Tennis-related datasets this week, all of which come courtesy of Wikipedia! "All records are based on data from the Association of Tennis Professionals (ATP), the International Tennis Federation (ITF), and the official websites of the four Grand Slam tournaments." - Wikipedia
+"The Open Era is the current era of professional tennis. It began in 1968 when the Grand Slam tournaments allowed professional players to compete with amateurs, ending the division that had persisted since the dawn of the sport in the 19th century." - Wikipedia
+"The Grand Slam tournaments, also called majors, are the four most important annual tennis events. They offer the most ranking points, prize money, public and media attention, the greatest strength and size of field, and greater number of "best of" sets for men. The Grand Slam itinerary consists of the Australian Open in mid January, the French Open around late May through early June, Wimbledon in June-July, and the US Open in August-September. Each tournament is played over a two-week period. The Australian and United States tournaments are played on hard courts, the French on clay, and Wimbledon on grass." - Wikipedia
+The court surface could be an interesting additional piece of data that I have left out, you could add it in with some clever case_when()
calls.
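A minimal sketch, assuming the datasets read in below, with tournament labels guessed from the Wikipedia quote above; check unique(grand_slams$grand_slam) first, since the labels in the file may be spelled or cased differently.

```r
library(dplyr)

# Assign a court surface to each major (labels are assumptions, see note above)
grand_slams <- grand_slams %>%
  mutate(surface = case_when(
    grand_slam %in% c("Australian Open", "US Open") ~ "Hard",
    grand_slam == "French Open" ~ "Clay",
    grand_slam == "Wimbledon" ~ "Grass",
    TRUE ~ NA_character_
  ))
```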
I have tamed the datasets pretty thoroughly, but there are several ways you could combine, summarize, or otherwise plot the data for this week. I have a spoiler/hint at the bottom if you get stuck with combining! I have also uploaded my relatively rough cleaning and data collection .rmd.

The Grand Slam tournaments happen in a rough two-week timeframe as mentioned above; however, I was unable to find a nice dataset that covered the specific day of both the men's and women's finals. As such I have used a static date that estimates the date of the championship match, which likely has an error of a few days for each tournament. This is still useful for determining the approximate age of the player at each tournament.
- Women's Timeline
- Men's Timeline
- Date of Birth/First Title
- Women's Singles Champs
- Men's Singles Champs
The Financial Times Article has lots of great inspiration plots, but uses different datasets. Specifically the author used match wins vs tournament wins and tennis rankings over time rather than tournament placing. The author John Burn-Murdoch is a great follow for DataViz and visual storytelling resources, including R and D3.
Additionally, a gist and Tweet from John Burn-Murdoch go through the process of collecting some men's tennis data, and then how he iterated across several plots. It's worth taking a look at, either for code inspiration for this week or as a general example of plot iteration for publication.

```r
player_dob <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/player_dob.csv")

grand_slams <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/grand_slams.csv")

grand_slam_timeline <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-09/grand_slam_timeline.csv")
```
grand_slam_timeline
| variable | class | description |
|---|---|---|
| player | character | Player Name |
| year | integer | Tournament Year |
| tournament | character | Tournament name |
| outcome | character | Outcome - where player was eliminated, was absent, or won - there are some NAs |
| gender | character | Male/Female for this dataset |
grand_slams
| variable | class | description |
|---|---|---|
| year | integer | Tournament Year |
| grand_slam | character | Tournament name |
| name | character | Player Name |
| rolling_win_count | integer | Rolling win = cumulative sum of wins across time/player |
| tournament_date | date | Approximate Tournament Date (ymd) |
| gender | character | Male/Female for this dataset |
player_dob
| variable | class | description |
|---|---|---|
| name | character | Player name |
| grand_slam | character | Tournament for first major title win |
| date_of_birth | date | date of birth (ymd) |
| date_of_first_title | date | date of first major title win (ymd) |
| age | double | Age in days |
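Since age is recorded in days, a minimal sketch for converting it to years (assuming player_dob has been read in as shown above):

```r
library(dplyr)

# age is in days; divide by 365.25 for an approximate age in years
player_dob %>%
  mutate(age_years = as.numeric(age) / 365.25) %>%
  arrange(age_years)
```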
To get tournament performance at a given age, rather than simply across time, we need to join the date-of-birth dataset with the grand slam dataset.

```r
age_slams_comb <- left_join(grand_slams, player_dob, by = c("name")) %>%
  mutate(age = tournament_date - date_of_birth) %>% # needs to be datetime
  group_by(name, age, gender) %>%
  summarize(counts = n()) %>%
  group_by(name) %>%
  mutate(total_wins = cumsum(counts)) %>%
  arrange(desc(total_wins))

# test plot
age_slams_comb %>%
  ggplot(aes(x = age, y = total_wins, group = name)) +
  geom_point() +
  geom_step() +
  facet_wrap(~gender)
```
A weekly data project aimed at the R ecosystem. As this project was borne out of the R4DS Online Learning Community and the R for Data Science textbook, an emphasis was placed on understanding how to summarize and arrange data to make meaningful charts with ggplot2, tidyr, dplyr, and other tools in the tidyverse ecosystem. However, any code-based methodology is welcome - just please remember to share the code used to generate the results.

Join the R4DS Online Learning Community in the weekly #TidyTuesday event! Every week we post a raw dataset, a chart or article related to that dataset, and ask you to explore the data. While the dataset will be "tamed", it will not always be tidy! As such you might need to apply various R for Data Science techniques to wrangle the data into a true tidy format. The goal of TidyTuesday is to apply your R skills, get feedback, explore others' work, and connect with the greater #RStats community! We encourage everyone of all skill levels to participate!
We will have many sources of data and want to emphasize that no causation is implied. There are various moderating variables that affect all data, many of which might not have been captured in these datasets. As such, our guidelines are to use the data provided to practice your data tidying and plotting techniques. Participants are invited to consider for themselves what nuancing factors might underlie these relationships.
The intent of Tidy Tuesday is to provide a safe and supportive forum for individuals to practice their wrangling and data visualization skills independent of drawing conclusions. While we understand that the two are related, the focus of this practice is purely on building skills with real-world data.

All data will be posted on the data sets page on Monday. It will include the link to the original article (for context) and to the data set.
+We welcome all newcomers, enthusiasts, and experts to participate, but be mindful of a few things:
+#RStats
practitioners or their code! Be supportive and kind to each other! Like others' posts and help promote the #RStats community!

Want to submit an interesting dataset? Please open an Issue and post a link to the article (or blog post, etc.) using the data, and we can discuss adding it to a future TidyTuesday event!
| Link | Description |
|---|---|
| Link | The R4DS Online Learning Community Website |
| Link | The R for Data Science textbook |
| Link | Carbon for sharing beautiful code pics |
| Link | Post gist to Carbon from RStudio |
| Link | Post to Carbon from RStudio |
| Link | Join GitHub! |
| Link | Basics of GitHub |
| Link | Learn how to use GitHub with R |
| Link | Save high-rez ggplot2 images |
| Link | Description |
|---|---|
| Link | Data is Plural collection |
| Link | BuzzFeedNews GitHub |
| Link | The Economist GitHub |
| Link | The fivethirtyeight data package |
| Link | The Upshot by NY Times |
| Link | The Baltimore Sun Data Desk |
| Link | The LA Times Data Desk |
| Link | Open News Labs |
| Link | BBC Data Journalism team |
Only books available freely online are sourced here. Feel free to add to the list.
| Link | Description |
|---|---|
| Link | Fundamentals of Data Viz by Claus Wilke |
| Link | The Art of Data Science by Roger D. Peng & Elizabeth Matsui |
| Link | Tidy Text Mining by Julia Silge & David Robinson |
| Link | Geocomputation with R by Robin Lovelace, Jakub Nowosad, Jannes Muenchow |
| Link | Data Visualization by Kieran Healy |
| Link | ggplot2 cookbook by Winston Chang |
| Link | BBC Data Journalism team |
h/t to Bob Rudis for sharing the data source, and to Roel Hogervorst for the guide to scraping this data. He provided the bulk of the scraping code, and I added a bit of additional data cleaning. The data this week comes from Privacy Affairs.

I have also included all the raw text (gdpr_text.tsv) for the actual GDPR legal documents, in case anyone is interested in parsing them or using them alongside the violations.
Per Wikipedia, GDPR is:

> The General Data Protection Regulation (EU) 2016/679 (GDPR) is a regulation in EU law on data protection and privacy in the European Union (EU) and the European Economic Area (EEA). It also addresses the transfer of personal data outside the EU and EEA areas. The GDPR aims primarily to give control to individuals over their personal data and to simplify the regulatory environment for international business by unifying the regulation within the EU. Superseding the Data Protection Directive 95/46/EC, the regulation contains provisions and requirements related to the processing of personal data of individuals (formally called data subjects in the GDPR) who reside in the EEA, and applies to any enterprise—regardless of its location and the data subjects' citizenship or residence—that is processing the personal information of data subjects inside the EEA.
```r
# Get the Data

gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
gdpr_text <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv')

# Or read in with the tidytuesdayR package (https://github.com/thebioengineer/tidytuesdayR)
# Please note: to use 2020 data you need a tidytuesdayR version released after Jan 2020.

# Either ISO-8601 date or year/week works!

# Install via devtools::install_github("thebioengineer/tidytuesdayR")

tuesdata <- tidytuesdayR::tt_load('2020-04-21')
tuesdata <- tidytuesdayR::tt_load(2020, week = 17)

gdpr_violations <- tuesdata$gdpr_violations
```
| variable | class | description |
|---|---|---|
| id | integer | Identifier for fine/violation |
| picture | character | SVG image of violation country flag |
| name | character | Name of country where violation was enforced |
| price | integer | Fine price in Euros (€) |
| authority | character | Authority that enacted the violation |
| date | character | Date of violation |
| controller | character | Controller of data - the violator |
| article_violated | character | Specific GDPR Article violated (see the gdpr_text.tsv data for specifics) |
| type | character | Type of violation |
| source | character | Original source (URL) of fine data |
| summary | character | Summary of violation |
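A minimal sketch summarising the fines by country, assuming gdpr_violations has been read in as shown above:

```r
library(dplyr)

# price is the fine in Euros
gdpr_violations %>%
  group_by(name) %>%
  summarise(
    n_fines = n(),
    total_eur = sum(price, na.rm = TRUE),
    median_eur = median(price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_eur))
```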
| variable | class | description |
|---|---|---|
| chapter | double | GDPR Chapter Number |
| chapter_title | character | Chapter title |
| article | double | GDPR Article number |
| article_title | character | Article title |
| sub_article | double | Sub article number |
| gdpr_text | character | Raw text of article/subarticle |
| href | character | URL to the raw text itself |
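One way to connect the two files is through the article number. A minimal sketch, assuming both datasets have been read in as shown above (article_violated lists articles separated by "|", which is how the cleaning script below stores them):

```r
library(tidyverse)

violation_articles <- gdpr_violations %>%
  separate_rows(article_violated, sep = "\\|") %>%
  # pull the leading article number out of strings like "Art. 32 GDPR"
  mutate(article = as.numeric(stringr::str_extract(article_violated, "\\d+"))) %>%
  left_join(gdpr_text %>% distinct(article, article_title), by = "article")
```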
```r
library(tidyverse)
library(rvest)

# Note the following code was adapted from
# https://blog.rmhogervorst.nl/blog/2020/04/08/scraping-gdpr-fines/

link <- "https://www.privacyaffairs.com/gdpr-fines/"
page <- read_html(link)


temp <- page %>% html_nodes("script") %>%
  .[9] %>%
  rvest::html_text()

ends <- str_locate_all(temp, "\\]")
starts <- str_locate_all(temp, "\\[")

table1 <- temp %>%
  stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>%
  str_remove_all("\\\n") %>%
  str_remove_all("\\\r") %>%
  jsonlite::fromJSON() %>%
  as_tibble() %>%
  mutate(summary = str_remove_all(summary, "<p>|</p>|\n"))


table2 <- temp %>%
  stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>%
  str_remove_all("\\\n") %>%
  str_remove_all("\\\r") %>%
  jsonlite::fromJSON() %>%
  as_tibble() %>%
  mutate(summary = str_remove_all(summary, "<p>|</p>|\n"))


all_df <- bind_rows(table1, table2) %>%
  janitor::clean_names() %>%
  mutate(
    authority = str_remove(authority, "\t"),
    article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
      str_remove('</a>'),
    article_violated = str_replace_all(article_violated, ", Art", "|Art"),
    type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
      str_remove('</a>')
  )

# most frequent articles violated
all_df %>%
  separate_rows(article_violated, sep = "\\|") %>%
  count(article_violated, sort = T)

all_df %>%
  write_tsv("2020/2020-04-21/gdpr_violations.tsv")


# Getting the actual article text -----------------------------------------

raw_article <- "https://gdpr-info.eu/" %>%
  read_html()

# Get all the urls for specific articles/chapters
gdpr_href <- raw_article %>%
  html_node(xpath = '//*[@id="tablepress-12"]') %>%
  html_nodes("a") %>%
  html_attr("href")

# pull the titles as well
gdpr_titles <- raw_article %>%
  html_node(xpath = '//*[@id="tablepress-12"]') %>%
  html_nodes("a") %>%
  html_attr("data-title")

# pull the numbers of article/chapters
gdpr_numbers <- raw_article %>%
  html_node(xpath = '//*[@id="tablepress-12"]') %>%
  html_nodes("a") %>%
  html_text()

# put it all into a df
gdpr_df <- tibble(
  article = gdpr_numbers,
  title = str_trim(gdpr_titles),
  href = gdpr_href
)

# Tidy up the data, create chapters vs articles
clean_gdpr <- gdpr_df %>%
  mutate(chapter = if_else(str_length(article) > 3, article, NA_character_),
         chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>%
  fill(chapter, chapter_title) %>%
  filter(!str_detect(article, "Chapter")) %>%
  mutate(article = as.double(article)) %>%
  filter(!is.na(article)) %>%
  select(starts_with("chapter"), article, article_title = title, href)

clean_gdpr

# LONG running outcome
# Get all the raw html from each of the urls for each article
all_articles <- clean_gdpr %>%
  mutate(raw_html = map(href, read_html))

# function to take raw html and turn it into text for that specific article
get_gdpr_text <- function(html_in){

  test_var <- html_in %>%
    html_node(".entry-content") %>%
    html_nodes("ol") %>%
    html_text()

  if (length(test_var) == 0){
    text <- html_in %>%
      html_node(".entry-content > p") %>%
      html_text() %>%
      str_remove("^[:digit:]")
  } else {
    text <- html_in %>%
      html_node(".entry-content") %>%
      html_nodes("ol") %>%
      html_text() %>%
      .[[1]] %>%
      str_replace_all(";\n", "\t") %>%
      str_replace_all(":\n", "\t") %>%
      str_split("\n") %>%
      .[[1]] %>%
      .[. != ""] %>%
      str_replace_all("\t", "\n") %>%
      str_remove("^[:digit:]")
  }

  text

}

# Test
get_gdpr_text(read_html("http://gdpr-info.eu/art-2-gdpr/"))

# unnest the list column of text
clean_articles <- all_articles %>%
  mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>%
  unnest_longer(gdpr_text)

# final dataframe
final_articles <- clean_articles %>%
  group_by(article) %>%
  mutate(sub_article = row_number()) %>%
  relocate(sub_article, .after = "article_title") %>%
  relocate(gdpr_text, .after = "sub_article") %>%
  ungroup() %>%
  mutate(chapter = str_extract(chapter, "[:digit:]+")) %>%
  mutate_at(vars(chapter, article, sub_article), as.double) %>%
  select(-raw_html)

final_articles %>% view()

write_tsv(final_articles, "2020/2020-04-21/gdpr_text.tsv")
```
Archive of datasets and articles from the 2020 series of #TidyTuesday events.

Archive of datasets and articles from the 2021 series of #TidyTuesday events.
Please add alt text (alternative text) to all of your posted graphics for #TidyTuesday.
.
Twitter provides guidelines for how to add alt text to your images.
+The DataViz Society/Nightingale by way of Amy Cesal has an article on writing good alt text for plots/graphs.
+++Here's a simple formula for writing alt text for data visualization:
+ +It's helpful for people with partial sight to know what chart type it is and gives context for understanding the rest of the visual. Example: Line graph
+ +What data is included in the chart? The x and y axis labels may help you figure this out. Example: number of bananas sold per day in the last year
+ +Think about why you're including this visual. What does it show that's meaningful. There should be a point to every visual and you should tell people what to look for. Example: the winter months have more banana sales
+ +Don't include this in your alt text, but it should be included somewhere in the surrounding text. People should be able to click on a link to view the source data or dig further into the visual. This provides transparency about your source and lets people explore the data. Example: Data from the USDA
+
Penn State has an article on writing alt text descriptions for charts and tables.
+++Charts, graphs and maps use visuals to convey complex images to users. But since they are images, these media provide serious accessibility issues to colorblind users and users of screen readers. See the examples on this page for details on how to make charts more accessible.
+
The {rtweet} package includes the ability to post tweets with alt text programmatically.
Need a reminder? There are extensions that force you to remember to add Alt Text to Tweets with media.
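Putting the formula above into practice, a minimal sketch: the banana_sales data frame is hypothetical, labs(alt = ...) assumes a recent ggplot2 release, and the post_tweet() argument names follow recent {rtweet} versions (check ?rtweet::post_tweet for your installed version).

```r
library(ggplot2)

# Chart type + data + takeaway, per the alt text formula above
alt_text <- "Line graph of the number of bananas sold per day in the last year, showing that the winter months have more banana sales."

p <- ggplot(banana_sales, aes(x = day, y = sold)) +  # banana_sales is a made-up example
  geom_line() +
  labs(title = "Banana sales by day", alt = alt_text)

ggsave("banana_sales.png", p)

# Attach the same alt text when posting the image
rtweet::post_tweet(
  status = "My #TidyTuesday plot",
  media = "banana_sales.png",
  media_alt_text = alt_text
)
```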
+ +The data this week comes from Post45 Data by way of Sara Stoudt.
+See their Data Description for full details.
+++ +Each peer-reviewed dataset has an accompanying curatorial statement, which provides an overview of the data that explains its contents, construction, and some possible uses.
+Please cite data from the Post45 Data Collective and use the following six components in your citation:
++
+- +
+author name(s)
+- +
+date published in the Post45 repository
+- +
+title
+- +
+global persistent > identifier: [DOI](https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F> %2Fwww.doi.org%2F&data=04%7C01%7Ckayla.shipp.kamibayashi%40emory.edu%7C31069808f> 9074a2a59ad08d8ce96abd0%7Ce004fb9cb0a4424fbcd0322606d5df38%7C0%7C0%7C63748649381> 8763862%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1h> aWwiLCJXVCI6Mn0%3D%7C1000&sdata=3EOVEbBZrdRlmq3n%2BFQkyf8cEK7jdW99oo8LFoSqXEo%3D> &reserved=0 "Original URL: http://www.doi.org/. Click or tap if you trust this > link.")
+- +
+Post45 Data Collective
+- +
+version number
+Example replication data citation from The Program Era Project, Kelly, White, and Glass, 2021:
+Kelly, Nicholas; White, Nicole, Glass, Loren, 03/01/2021, "The Program Era > Project," DOI: https://doi.org/10.18737/CNJV1733p4520210415, Post45 Data > Collective, V1.
+
```r
# Get the Data

# Read in with tidytuesdayR package
# Install from CRAN via: install.packages("tidytuesdayR")
# This loads the readme and all the datasets for the week of interest

# Either ISO-8601 date or year/week works!

tuesdata <- tidytuesdayR::tt_load('2022-05-10')
tuesdata <- tidytuesdayR::tt_load(2022, week = 19)

nyt_titles <- tuesdata$nyt_titles

# Or read in the data manually

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
nyt_full <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_full.tsv')
```
nyt_titles is a subset/summary of the larger nyt_full dataset.
| variable | class | description |
|---|---|---|
| id | double | Book id |
| title | character | Title of book |
| author | character | Author |
| year | double | Year |
| total_weeks | double | Total weeks on best sellers list |
| first_week | double | First week on list (date) |
| debut_rank | double | Debut rank on the list |
| best_rank | double | Best rank |
| variable | class | description |
|---|---|---|
| year | double | Year |
| week | double | Week (as date) |
| rank | double | Rank (1 to 18) |
| title_id | double | Title ID |
| title | character | Title of book |
| author | character | Author |
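A minimal sketch tying the two tables together (assuming the data has been read in as shown above): recompute weeks-on-list per title from nyt_full and compare it with the total_weeks summary in nyt_titles.

```r
library(tidyverse)

# One row per title in nyt_full, counting the weeks it appears on the list
weeks_per_title <- nyt_full %>%
  count(title_id, name = "weeks_on_list")

nyt_titles %>%
  left_join(weeks_per_title, by = c("id" = "title_id")) %>%
  select(title, author, total_weeks, weeks_on_list)
```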
Archive of datasets and articles from the 2022 series of #TidyTuesday events.

Archive of datasets and articles from the 2023 series of #TidyTuesday events.

Archive of datasets and articles from the 2024 series of #TidyTuesday events.