diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index 0f2fe08..fc57a73 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master]
+    branches: [main]
   pull_request:
-    branches: [main, master]
+    branches: [main]
 
 name: R-CMD-check
 
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
index f67debf..de4a17f 100644
--- a/.github/workflows/pkgdown.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master]
+    branches: [main]
   pull_request:
-    branches: [main, master]
+    branches: [main]
   release:
     types: [published]
   workflow_dispatch:
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index fefc52e..e7f5dfd 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master]
+    branches: [main]
   pull_request:
-    branches: [main, master]
+    branches: [main]
 
 name: test-coverage
 
diff --git a/DESCRIPTION b/DESCRIPTION
index 231e22d..4fc65e4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -15,11 +15,11 @@ Description: 'TidyTuesday' is a project by the 'Data Science Learning
     people to analyze and visualize. This package provides the tools to
     easily download this data and the description of the source.
 License: MIT + file LICENSE
-URL: https://thebioengineer.github.io/tidytuesdayR/,
-    https://github.com/thebioengineer/tidytuesdayR
-BugReports: https://github.com/thebioengineer/tidytuesdayR/issues
+URL: https://dslc-io.github.io/tidytuesdayR/,
+    https://github.com/dslc-io/tidytuesdayR
+BugReports: https://github.com/dslc-io/tidytuesdayR/issues
 Depends:
-    R (>= 3.4.0)
+    R (>= 3.5.0)
 Imports:
     cli,
     gh,
diff --git a/man/tidytuesdayR-package.Rd b/man/tidytuesdayR-package.Rd
index 7210484..6e0576b 100644
--- a/man/tidytuesdayR-package.Rd
+++ b/man/tidytuesdayR-package.Rd
@@ -13,9 +13,9 @@
 
 \seealso{
 Useful links:
 \itemize{
-  \item \url{https://thebioengineer.github.io/tidytuesdayR/}
-  \item \url{https://github.com/thebioengineer/tidytuesdayR}
-  \item Report bugs at \url{https://github.com/thebioengineer/tidytuesdayR/issues}
+  \item \url{https://dslc-io.github.io/tidytuesdayR/}
+  \item \url{https://github.com/dslc-io/tidytuesdayR}
+  \item Report bugs at \url{https://github.com/dslc-io/tidytuesdayR/issues}
 }
 }
diff --git a/readme.md b/readme.md
index ce84409..2975248 100644
--- a/readme.md
+++ b/readme.md
@@ -4,13 +4,13 @@ Ellis Hughes
 
 [![CRAN
 status](https://www.r-pkg.org/badges/version/tidytuesdayR)](https://CRAN.R-project.org/package=tidytuesdayR)
-[![R build status](https://github.com/thebioengineer/tidytuesdayR/workflows/R-CMD-check/badge.svg)](https://github.com/thebioengineer/tidytuesdayR/actions)
+[![R build status](https://github.com/dslc-io/tidytuesdayR/workflows/R-CMD-check/badge.svg)](https://github.com/dslc-io/tidytuesdayR/actions)
 [![Coverage
-status](https://codecov.io/gh/thebioengineer/tidytuesdayR/branch/master/graph/badge.svg)](https://app.codecov.io/github/thebioengineer/tidytuesdayR?branch=master)
+status](https://codecov.io/gh/dslc-io/tidytuesdayR/branch/main/graph/badge.svg)](https://app.codecov.io/github/dslc-io/tidytuesdayR?branch=main)
 [![Downloads from the RStudio CRAN
 mirror](http://cranlogs.r-pkg.org/badges/tidytuesdayR)](https://cran.r-project.org/package=tidytuesdayR)
 [![License:
 MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![R-CMD-check](https://github.com/thebioengineer/tidytuesdayR/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/thebioengineer/tidytuesdayR/actions/workflows/R-CMD-check.yaml)
+[![R-CMD-check](https://github.com/dslc-io/tidytuesdayR/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/dslc-io/tidytuesdayR/actions/workflows/R-CMD-check.yaml)
 
 {tidytuesdayR} has the main goal to make it easy to participate in the
@@ -30,8 +30,8 @@ install.packages("tidytuesdayR")
 To get the latest in-development features, install the development
 version from GitHub:
 ``` r
-#install.packages("remotes")
-remotes::install_github("thebioengineer/tidytuesdayR")
+#install.packages("pak")
+pak::pak("dslc-io/tidytuesdayR")
 ```
 
 ## Usage
diff --git a/tests/testthat/fixtures/readme2019.html b/tests/testthat/fixtures/readme2019.html
index 9caf498..e8b8088 100644
--- a/tests/testthat/fixtures/readme2019.html
+++ b/tests/testthat/fixtures/readme2019.html
@@ -354,7 +354,7 @@

2019

 2019-10-15 Car Fuel Economy EPA
-Ellis Hughes 
+Ellis Hughes
 43
diff --git a/tests/testthat/fixtures/readme2020-04-21.html b/tests/testthat/fixtures/readme2020-04-21.html
index 26d1e15..76a07ae 100644
--- a/tests/testthat/fixtures/readme2020-04-21.html
+++ b/tests/testthat/fixtures/readme2020-04-21.html
@@ -19,12 +19,12 @@

Get the data here

 gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
 gdpr_text <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv')
 
-# Or read in with tidytuesdayR package (https://github.com/thebioengineer/tidytuesdayR)
+# Or read in with tidytuesdayR package (https://github.com/dslc-io/tidytuesdayR)
 # PLEASE NOTE TO USE 2020 DATA YOU NEED TO USE the tidytuesdayR version after Jan 2020.
 # Either ISO-8601 date or year/week works!
-# Install via devtools::install_github("thebioengineer/tidytuesdayR")
+# Install via pak::pak("dslc-io/tidytuesdayR")
 tuesdata <- tidytuesdayR::tt_load('2020-04-21')
 tuesdata <- tidytuesdayR::tt_load(2020, week = 17)
@@ -35,12 +35,12 @@

Get the data here

 gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
 gdpr_text <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv')
 
-# Or read in with tidytuesdayR package (https://github.com/thebioengineer/tidytuesdayR)
+# Or read in with tidytuesdayR package (https://github.com/dslc-io/tidytuesdayR)
 # PLEASE NOTE TO USE 2020 DATA YOU NEED TO USE the tidytuesdayR version after Jan 2020.
 # Either ISO-8601 date or year/week works!
-# Install via devtools::install_github("thebioengineer/tidytuesdayR")
+# Install via pak::pak("dslc-io/tidytuesdayR")
 tuesdata <- tidytuesdayR::tt_load('2020-04-21')
 tuesdata <- tidytuesdayR::tt_load(2020, week = 17)
@@ -180,72 +180,72 @@

Cleaning Script

 page <- read_html(link)
 
-temp <- page %>% html_nodes("script") %>% 
-  .[9] %>% 
-  rvest::html_text() 
+temp <- page %>% html_nodes("script") %>%
+  .[9] %>%
+  rvest::html_text()
 
 ends <- str_locate_all(temp, "\\]")
 starts <- str_locate_all(temp, "\\[")
 
-table1 <- temp %>% 
-  stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>% 
-  str_remove_all("\\\n") %>% 
+table1 <- temp %>%
+  stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>%
+  str_remove_all("\\\n") %>%
   str_remove_all("\\\r") %>%
-  jsonlite::fromJSON() %>% 
-  as_tibble() %>% 
+  jsonlite::fromJSON() %>%
+  as_tibble() %>%
   mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))
 
-table2 <- temp %>% 
-  stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>% 
-  str_remove_all("\\\n") %>% 
-  str_remove_all("\\\r") %>% 
-  jsonlite::fromJSON() %>% 
-  as_tibble() %>% 
+table2 <- temp %>%
+  stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>%
+  str_remove_all("\\\n") %>%
+  str_remove_all("\\\r") %>%
+  jsonlite::fromJSON() %>%
+  as_tibble() %>%
   mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))
 
-all_df <- bind_rows(table1, table2) %>% 
+all_df <- bind_rows(table1, table2) %>%
   janitor::clean_names() %>%
   mutate(
     authority = str_remove(authority, "\t"),
-    article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>% 
+    article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
       str_remove('</a>'),
     article_violated = str_replace_all(article_violated, ", Art", "|Art"),
-    type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>% 
+    type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
       str_remove('</a>')
   )
 
 # most frequent articles violated
-all_df %>% 
-  separate_rows(article_violated, sep = "\\|") %>% 
+all_df %>%
+  separate_rows(article_violated, sep = "\\|") %>%
   count(article_violated, sort = T)
 
-all_df %>% 
+all_df %>%
   write_tsv("2020/2020-04-21/gdpr_violations.tsv")
 
 # Getting the actual article text -----------------------------------------
 
-raw_article <- "https://gdpr-info.eu/" %>% 
+raw_article <- "https://gdpr-info.eu/" %>%
   read_html()
 
 # Get all the urls for specific articles/chapters
-gdpr_href <- raw_article %>% 
-  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
-  html_nodes("a") %>% 
+gdpr_href <- raw_article %>%
+  html_node(xpath = '//*[@id="tablepress-12"]') %>%
+  html_nodes("a") %>%
   html_attr("href")
 
 # pull the titles as well
-gdpr_titles <- raw_article %>% 
-  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
-  html_nodes("a") %>% 
+gdpr_titles <- raw_article %>%
+  html_node(xpath = '//*[@id="tablepress-12"]') %>%
+  html_nodes("a") %>%
   html_attr("data-title")
 
 # pull the numbers of article/chapters
-gdpr_numbers <- raw_article %>% 
-  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
-  html_nodes("a") %>% 
+gdpr_numbers <- raw_article %>%
+  html_node(xpath = '//*[@id="tablepress-12"]') %>%
+  html_nodes("a") %>%
   html_text()
 
 # put it all into a df
@@ -253,75 +253,75 @@

Cleaning Script

   article = gdpr_numbers,
   title = str_trim(gdpr_titles),
   href = gdpr_href
-) 
+)
 
 # Tidy up the data, create chapters vs articles
-clean_gdpr <- gdpr_df %>% 
+clean_gdpr <- gdpr_df %>%
   mutate(chapter = if_else(str_length(article) > 3, article, NA_character_),
-         chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>% 
-  fill(chapter, chapter_title) %>% 
-  filter(!str_detect(article, "Chapter")) %>% 
-  mutate(article = as.double(article)) %>% 
-  filter(!is.na(article)) %>% 
+         chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>%
+  fill(chapter, chapter_title) %>%
+  filter(!str_detect(article, "Chapter")) %>%
+  mutate(article = as.double(article)) %>%
+  filter(!is.na(article)) %>%
   select(starts_with("chapter"), article, article_title = title, href)
 
 clean_gdpr
 
 # LONG running outcome
 # Get all the raw html from each of the urls for each article
-all_articles <- clean_gdpr %>% 
+all_articles <- clean_gdpr %>%
   mutate(raw_html = map(href, read_html))
 
 # function to take raw html and turn it into text for that specific article
 get_gdpr_text <- function(html_in){
-  
-  test_var <- html_in %>% 
-    html_node(".entry-content") %>% 
-    html_nodes("ol") %>% 
+
+  test_var <- html_in %>%
+    html_node(".entry-content") %>%
+    html_nodes("ol") %>%
     html_text()
-  
+
   if (length(test_var) == 0){
     text <- html_in %>%
-      html_node(".entry-content > p") %>% 
-      html_text() %>% 
-      str_remove("^[:digit:]") 
+      html_node(".entry-content > p") %>%
+      html_text() %>%
+      str_remove("^[:digit:]")
   } else {
-    text <- html_in %>% 
-      html_node(".entry-content") %>% 
-      html_nodes("ol") %>% 
-      html_text() %>% 
-      .[[1]] %>% 
-      str_replace_all(";\n", "\t") %>% 
-      str_replace_all(":\n", "\t") %>% 
-      str_split("\n") %>% 
-      .[[1]] %>% 
-      .[. != ""] %>% 
-      str_replace_all("\t", "\n") %>% 
+    text <- html_in %>%
+      html_node(".entry-content") %>%
+      html_nodes("ol") %>%
+      html_text() %>%
+      .[[1]] %>%
+      str_replace_all(";\n", "\t") %>%
+      str_replace_all(":\n", "\t") %>%
+      str_split("\n") %>%
+      .[[1]] %>%
+      .[. != ""] %>%
+      str_replace_all("\t", "\n") %>%
       str_remove("^[:digit:]")
   }
-  
-  
+
+
   text
-  
+
 }
 
 # Test
 get_gdpr_text(read_html("http://gdpr-info.eu/art-2-gdpr/"))
 
 # unnest the list column of text
-clean_articles <- all_articles %>% 
-  mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>% 
+clean_articles <- all_articles %>%
+  mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>%
   unnest_longer(gdpr_text)
 
 # final dataframe
-final_articles <- clean_articles %>% 
-  group_by(article) %>% 
-  mutate(sub_article = row_number()) %>% 
-  relocate(sub_article, .after = "article_title") %>% 
-  relocate(gdpr_text, .after = "sub_article") %>% 
-  ungroup() %>% 
-  mutate(chapter = str_extract(chapter, "[:digit:]+")) %>% 
-  mutate_at(vars(chapter, article, sub_article), as.double) %>% 
+final_articles <- clean_articles %>%
+  group_by(article) %>%
+  mutate(sub_article = row_number()) %>%
+  relocate(sub_article, .after = "article_title") %>%
+  relocate(gdpr_text, .after = "sub_article") %>%
+  ungroup() %>%
+  mutate(chapter = str_extract(chapter, "[:digit:]+")) %>%
+  mutate_at(vars(chapter, article, sub_article), as.double) %>%
   select(-raw_html)
 
 final_articles %>% view()
@@ -337,72 +337,72 @@

Cleaning Script

 page <- read_html(link)
 
-temp <- page %>% html_nodes("script") %>% 
-  .[9] %>% 
-  rvest::html_text() 
+temp <- page %>% html_nodes("script") %>%
+  .[9] %>%
+  rvest::html_text()
 
 ends <- str_locate_all(temp, "\\]")
 starts <- str_locate_all(temp, "\\[")
 
-table1 <- temp %>% 
-  stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>% 
-  str_remove_all("\\\n") %>% 
+table1 <- temp %>%
+  stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>%
+  str_remove_all("\\\n") %>%
   str_remove_all("\\\r") %>%
-  jsonlite::fromJSON() %>% 
-  as_tibble() %>% 
+  jsonlite::fromJSON() %>%
+  as_tibble() %>%
   mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))
 
-table2 <- temp %>% 
-  stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>% 
-  str_remove_all("\\\n") %>% 
-  str_remove_all("\\\r") %>% 
-  jsonlite::fromJSON() %>% 
-  as_tibble() %>% 
+table2 <- temp %>%
+  stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>%
+  str_remove_all("\\\n") %>%
+  str_remove_all("\\\r") %>%
+  jsonlite::fromJSON() %>%
+  as_tibble() %>%
   mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))
 
-all_df <- bind_rows(table1, table2) %>% 
+all_df <- bind_rows(table1, table2) %>%
   janitor::clean_names() %>%
   mutate(
     authority = str_remove(authority, "\t"),
-    article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>% 
+    article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
      str_remove('</a>'),
    article_violated = str_replace_all(article_violated, ", Art", "|Art"),
-    type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>% 
+    type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
      str_remove('</a>')
  )
 
 # most frequent articles violated
-all_df %>% 
-  separate_rows(article_violated, sep = "\\|") %>% 
+all_df %>%
+  separate_rows(article_violated, sep = "\\|") %>%
   count(article_violated, sort = T)
 
-all_df %>% 
+all_df %>%
   write_tsv("2020/2020-04-21/gdpr_violations.tsv")
 
 # Getting the actual article text -----------------------------------------
 
-raw_article <- "https://gdpr-info.eu/" %>% 
+raw_article <- "https://gdpr-info.eu/" %>%
   read_html()
 
 # Get all the urls for specific articles/chapters
-gdpr_href <- raw_article %>% 
-  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
-  html_nodes("a") %>% 
+gdpr_href <- raw_article %>%
+  html_node(xpath = '//*[@id="tablepress-12"]') %>%
+  html_nodes("a") %>%
   html_attr("href")
 
 # pull the titles as well
-gdpr_titles <- raw_article %>% 
-  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
-  html_nodes("a") %>% 
+gdpr_titles <- raw_article %>%
+  html_node(xpath = '//*[@id="tablepress-12"]') %>%
+  html_nodes("a") %>%
   html_attr("data-title")
 
 # pull the numbers of article/chapters
-gdpr_numbers <- raw_article %>% 
-  html_node(xpath = '//*[@id="tablepress-12"]') %>% 
-  html_nodes("a") %>% 
+gdpr_numbers <- raw_article %>%
+  html_node(xpath = '//*[@id="tablepress-12"]') %>%
+  html_nodes("a") %>%
   html_text()
 
 # put it all into a df
@@ -410,75 +410,75 @@

Cleaning Script

   article = gdpr_numbers,
   title = str_trim(gdpr_titles),
   href = gdpr_href
-) 
+)
 
 # Tidy up the data, create chapters vs articles
-clean_gdpr <- gdpr_df %>% 
+clean_gdpr <- gdpr_df %>%
   mutate(chapter = if_else(str_length(article) > 3, article, NA_character_),
-         chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>% 
-  fill(chapter, chapter_title) %>% 
-  filter(!str_detect(article, "Chapter")) %>% 
-  mutate(article = as.double(article)) %>% 
-  filter(!is.na(article)) %>% 
+         chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>%
+  fill(chapter, chapter_title) %>%
+  filter(!str_detect(article, "Chapter")) %>%
+  mutate(article = as.double(article)) %>%
+  filter(!is.na(article)) %>%
   select(starts_with("chapter"), article, article_title = title, href)
 
 clean_gdpr
 
 # LONG running outcome
 # Get all the raw html from each of the urls for each article
-all_articles <- clean_gdpr %>% 
+all_articles <- clean_gdpr %>%
   mutate(raw_html = map(href, read_html))
 
 # function to take raw html and turn it into text for that specific article
 get_gdpr_text <- function(html_in){
-  
-  test_var <- html_in %>% 
-    html_node(".entry-content") %>% 
-    html_nodes("ol") %>% 
+
+  test_var <- html_in %>%
+    html_node(".entry-content") %>%
+    html_nodes("ol") %>%
     html_text()
-  
+
   if (length(test_var) == 0){
     text <- html_in %>%
-      html_node(".entry-content > p") %>% 
-      html_text() %>% 
-      str_remove("^[:digit:]") 
+      html_node(".entry-content > p") %>%
+      html_text() %>%
+      str_remove("^[:digit:]")
   } else {
-    text <- html_in %>% 
-      html_node(".entry-content") %>% 
-      html_nodes("ol") %>% 
-      html_text() %>% 
-      .[[1]] %>% 
-      str_replace_all(";\n", "\t") %>% 
-      str_replace_all(":\n", "\t") %>% 
-      str_split("\n") %>% 
-      .[[1]] %>% 
-      .[. != ""] %>% 
-      str_replace_all("\t", "\n") %>% 
+    text <- html_in %>%
+      html_node(".entry-content") %>%
+      html_nodes("ol") %>%
+      html_text() %>%
+      .[[1]] %>%
+      str_replace_all(";\n", "\t") %>%
+      str_replace_all(":\n", "\t") %>%
+      str_split("\n") %>%
+      .[[1]] %>%
+      .[. != ""] %>%
+      str_replace_all("\t", "\n") %>%
       str_remove("^[:digit:]")
   }
-  
-  
+
+
   text
-  
+
 }
 
 # Test
 get_gdpr_text(read_html("http://gdpr-info.eu/art-2-gdpr/"))
 
 # unnest the list column of text
-clean_articles <- all_articles %>% 
-  mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>% 
+clean_articles <- all_articles %>%
+  mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>%
   unnest_longer(gdpr_text)
 
 # final dataframe
-final_articles <- clean_articles %>% 
-  group_by(article) %>% 
-  mutate(sub_article = row_number()) %>% 
-  relocate(sub_article, .after = "article_title") %>% 
-  relocate(gdpr_text, .after = "sub_article") %>% 
-  ungroup() %>% 
-  mutate(chapter = str_extract(chapter, "[:digit:]+")) %>% 
-  mutate_at(vars(chapter, article, sub_article), as.double) %>% 
+final_articles <- clean_articles %>%
+  group_by(article) %>%
+  mutate(sub_article = row_number()) %>%
+  relocate(sub_article, .after = "article_title") %>%
+  relocate(gdpr_text, .after = "sub_article") %>%
+  ungroup() %>%
+  mutate(chapter = str_extract(chapter, "[:digit:]+")) %>%
+  mutate_at(vars(chapter, article, sub_article), as.double) %>%
   select(-raw_html)
 
 final_articles %>% view()
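
A minimal usage sketch of the install-and-load flow these files document. The `pak::pak()` call and repo path come from the updated readme, and the `tt_load()` calls mirror the fixture comments; the `gdpr_violations` element name is taken from the 2020-04-21 fixture. Treat it as illustrative: it downloads data from GitHub, so it needs network access.

``` r
# Install the development version from GitHub (per the updated readme)
# install.packages("pak")
pak::pak("dslc-io/tidytuesdayR")

# Load a TidyTuesday week: either an ISO-8601 date or a year/week pair works
tuesdata <- tidytuesdayR::tt_load("2020-04-21")
tuesdata <- tidytuesdayR::tt_load(2020, week = 17)

# Individual datasets are available by name on the returned object
gdpr_violations <- tuesdata$gdpr_violations
```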