diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index 0f2fe08..fc57a73 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
- branches: [main, master]
+ branches: [main]
pull_request:
- branches: [main, master]
+ branches: [main]
name: R-CMD-check
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
index f67debf..de4a17f 100644
--- a/.github/workflows/pkgdown.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
- branches: [main, master]
+ branches: [main]
pull_request:
- branches: [main, master]
+ branches: [main]
release:
types: [published]
workflow_dispatch:
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index fefc52e..e7f5dfd 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
- branches: [main, master]
+ branches: [main]
pull_request:
- branches: [main, master]
+ branches: [main]
name: test-coverage
diff --git a/DESCRIPTION b/DESCRIPTION
index 231e22d..4fc65e4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -15,11 +15,11 @@ Description: 'TidyTuesday' is a project by the 'Data Science Learning
people to analyze and visualize. This package provides the tools to
easily download this data and the description of the source.
License: MIT + file LICENSE
-URL: https://thebioengineer.github.io/tidytuesdayR/,
- https://github.com/thebioengineer/tidytuesdayR
-BugReports: https://github.com/thebioengineer/tidytuesdayR/issues
+URL: https://dslc-io.github.io/tidytuesdayR/,
+ https://github.com/dslc-io/tidytuesdayR
+BugReports: https://github.com/dslc-io/tidytuesdayR/issues
Depends:
- R (>= 3.4.0)
+ R (>= 3.5.0)
Imports:
cli,
gh,
diff --git a/man/tidytuesdayR-package.Rd b/man/tidytuesdayR-package.Rd
index 7210484..6e0576b 100644
--- a/man/tidytuesdayR-package.Rd
+++ b/man/tidytuesdayR-package.Rd
@@ -13,9 +13,9 @@
\seealso{
Useful links:
\itemize{
- \item \url{https://thebioengineer.github.io/tidytuesdayR/}
- \item \url{https://github.com/thebioengineer/tidytuesdayR}
- \item Report bugs at \url{https://github.com/thebioengineer/tidytuesdayR/issues}
+ \item \url{https://dslc-io.github.io/tidytuesdayR/}
+ \item \url{https://github.com/dslc-io/tidytuesdayR}
+ \item Report bugs at \url{https://github.com/dslc-io/tidytuesdayR/issues}
}
}
diff --git a/readme.md b/readme.md
index ce84409..2975248 100644
--- a/readme.md
+++ b/readme.md
@@ -4,13 +4,13 @@ Ellis Hughes
[![CRAN status](https://www.r-pkg.org/badges/version/tidytuesdayR)](https://CRAN.R-project.org/package=tidytuesdayR)
-[![R build status](https://github.com/thebioengineer/tidytuesdayR/workflows/R-CMD-check/badge.svg)](https://github.com/thebioengineer/tidytuesdayR/actions)
+[![R build status](https://github.com/dslc-io/tidytuesdayR/workflows/R-CMD-check/badge.svg)](https://github.com/dslc-io/tidytuesdayR/actions)
[![Coverage
-status](https://codecov.io/gh/thebioengineer/tidytuesdayR/branch/master/graph/badge.svg)](https://app.codecov.io/github/thebioengineer/tidytuesdayR?branch=master)
+status](https://codecov.io/gh/dslc-io/tidytuesdayR/branch/main/graph/badge.svg)](https://app.codecov.io/github/dslc-io/tidytuesdayR?branch=main)
[![Downloads from the RStudio CRAN mirror](http://cranlogs.r-pkg.org/badges/tidytuesdayR)](https://cran.r-project.org/package=tidytuesdayR)
[![License:
MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![R-CMD-check](https://github.com/thebioengineer/tidytuesdayR/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/thebioengineer/tidytuesdayR/actions/workflows/R-CMD-check.yaml)
+[![R-CMD-check](https://github.com/dslc-io/tidytuesdayR/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/dslc-io/tidytuesdayR/actions/workflows/R-CMD-check.yaml)
{tidytuesdayR} aims to make it easy to participate in the
@@ -30,8 +30,8 @@ install.packages("tidytuesdayR")
To get the latest in-development features, install the development version from GitHub:
``` r
-#install.packages("remotes")
-remotes::install_github("thebioengineer/tidytuesdayR")
+#install.packages("pak")
+pak::pak("dslc-io/tidytuesdayR")
```
## Usage
diff --git a/tests/testthat/fixtures/readme2019.html b/tests/testthat/fixtures/readme2019.html
index 9caf498..e8b8088 100644
--- a/tests/testthat/fixtures/readme2019.html
+++ b/tests/testthat/fixtures/readme2019.html
@@ -354,7 +354,7 @@
2019-10-15 |
Car Fuel Economy |
EPA |
-Ellis Hughes |
+Ellis Hughes |
43 |
diff --git a/tests/testthat/fixtures/readme2020-04-21.html b/tests/testthat/fixtures/readme2020-04-21.html
index 26d1e15..76a07ae 100644
--- a/tests/testthat/fixtures/readme2020-04-21.html
+++ b/tests/testthat/fixtures/readme2020-04-21.html
@@ -19,12 +19,12 @@ Get the data here
gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
gdpr_text <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv')
-# Or read in with tidytuesdayR package (https://github.com/thebioengineer/tidytuesdayR)
+# Or read in with tidytuesdayR package (https://github.com/dslc-io/tidytuesdayR)
# PLEASE NOTE TO USE 2020 DATA YOU NEED TO USE the tidytuesdayR version after Jan 2020.
# Either ISO-8601 date or year/week works!
-# Install via devtools::install_github("thebioengineer/tidytuesdayR")
+# Install via pak::pak("dslc-io/tidytuesdayR")
tuesdata <- tidytuesdayR::tt_load('2020-04-21')
tuesdata <- tidytuesdayR::tt_load(2020, week = 17)
@@ -35,12 +35,12 @@ Get the data here
gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
gdpr_text <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv')
-# Or read in with tidytuesdayR package (https://github.com/thebioengineer/tidytuesdayR)
+# Or read in with tidytuesdayR package (https://github.com/dslc-io/tidytuesdayR)
# PLEASE NOTE TO USE 2020 DATA YOU NEED TO USE the tidytuesdayR version after Jan 2020.
# Either ISO-8601 date or year/week works!
-# Install via devtools::install_github("thebioengineer/tidytuesdayR")
+# Install via pak::pak("dslc-io/tidytuesdayR")
tuesdata <- tidytuesdayR::tt_load('2020-04-21')
tuesdata <- tidytuesdayR::tt_load(2020, week = 17)
@@ -180,72 +180,72 @@ Cleaning Script
page <- read_html(link)
-temp <- page %>% html_nodes("script") %>%
- .[9] %>%
- rvest::html_text()
+temp <- page %>% html_nodes("script") %>%
+ .[9] %>%
+ rvest::html_text()
ends <- str_locate_all(temp, "\\]")
starts <- str_locate_all(temp, "\\[")
-table1 <- temp %>%
- stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>%
- str_remove_all("\\\n") %>%
+table1 <- temp %>%
+ stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>%
+ str_remove_all("\\\n") %>%
str_remove_all("\\\r") %>%
- jsonlite::fromJSON() %>%
- as_tibble() %>%
+ jsonlite::fromJSON() %>%
+ as_tibble() %>%
mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))
-table2 <- temp %>%
- stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>%
- str_remove_all("\\\n") %>%
- str_remove_all("\\\r") %>%
- jsonlite::fromJSON() %>%
- as_tibble() %>%
+table2 <- temp %>%
+ stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>%
+ str_remove_all("\\\n") %>%
+ str_remove_all("\\\r") %>%
+ jsonlite::fromJSON() %>%
+ as_tibble() %>%
mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))
-all_df <- bind_rows(table1, table2) %>%
+all_df <- bind_rows(table1, table2) %>%
janitor::clean_names() %>%
mutate(
authority = str_remove(authority, "\t"),
- article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
+ article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
str_remove('</a>'),
article_violated = str_replace_all(article_violated, ", Art", "|Art"),
- type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
+ type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
str_remove('</a>')
)
# most frequent articles violated
-all_df %>%
- separate_rows(article_violated, sep = "\\|") %>%
+all_df %>%
+ separate_rows(article_violated, sep = "\\|") %>%
count(article_violated, sort = T)
-all_df %>%
+all_df %>%
write_tsv("2020/2020-04-21/gdpr_violations.tsv")
# Getting the actual article text -----------------------------------------
-raw_article <- "https://gdpr-info.eu/" %>%
+raw_article <- "https://gdpr-info.eu/" %>%
read_html()
# Get all the urls for specific articles/chapters
-gdpr_href <- raw_article %>%
- html_node(xpath = '//*[@id="tablepress-12"]') %>%
- html_nodes("a") %>%
+gdpr_href <- raw_article %>%
+ html_node(xpath = '//*[@id="tablepress-12"]') %>%
+ html_nodes("a") %>%
html_attr("href")
# pull the titles as well
-gdpr_titles <- raw_article %>%
- html_node(xpath = '//*[@id="tablepress-12"]') %>%
- html_nodes("a") %>%
+gdpr_titles <- raw_article %>%
+ html_node(xpath = '//*[@id="tablepress-12"]') %>%
+ html_nodes("a") %>%
html_attr("data-title")
# pull the numbers of article/chapters
-gdpr_numbers <- raw_article %>%
- html_node(xpath = '//*[@id="tablepress-12"]') %>%
- html_nodes("a") %>%
+gdpr_numbers <- raw_article %>%
+ html_node(xpath = '//*[@id="tablepress-12"]') %>%
+ html_nodes("a") %>%
html_text()
# put it all into a df
@@ -253,75 +253,75 @@ Cleaning Script
article = gdpr_numbers,
title = str_trim(gdpr_titles),
href = gdpr_href
-)
+)
# Tidy up the data, create chapters vs articles
-clean_gdpr <- gdpr_df %>%
+clean_gdpr <- gdpr_df %>%
mutate(chapter = if_else(str_length(article) > 3, article, NA_character_),
- chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>%
- fill(chapter, chapter_title) %>%
- filter(!str_detect(article, "Chapter")) %>%
- mutate(article = as.double(article)) %>%
- filter(!is.na(article)) %>%
+ chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>%
+ fill(chapter, chapter_title) %>%
+ filter(!str_detect(article, "Chapter")) %>%
+ mutate(article = as.double(article)) %>%
+ filter(!is.na(article)) %>%
select(starts_with("chapter"), article, article_title = title, href)
clean_gdpr
# LONG running outcome
# Get all the raw html from each of the urls for each article
-all_articles <- clean_gdpr %>%
+all_articles <- clean_gdpr %>%
mutate(raw_html = map(href, read_html))
# function to take raw html and turn it into text for that specific article
get_gdpr_text <- function(html_in){
-
- test_var <- html_in %>%
- html_node(".entry-content") %>%
- html_nodes("ol") %>%
+
+ test_var <- html_in %>%
+ html_node(".entry-content") %>%
+ html_nodes("ol") %>%
html_text()
-
+
if (length(test_var) == 0){
text <- html_in %>%
- html_node(".entry-content > p") %>%
- html_text() %>%
- str_remove("^[:digit:]")
+ html_node(".entry-content > p") %>%
+ html_text() %>%
+ str_remove("^[:digit:]")
} else {
- text <- html_in %>%
- html_node(".entry-content") %>%
- html_nodes("ol") %>%
- html_text() %>%
- .[[1]] %>%
- str_replace_all(";\n", "\t") %>%
- str_replace_all(":\n", "\t") %>%
- str_split("\n") %>%
- .[[1]] %>%
- .[. != ""] %>%
- str_replace_all("\t", "\n") %>%
+ text <- html_in %>%
+ html_node(".entry-content") %>%
+ html_nodes("ol") %>%
+ html_text() %>%
+ .[[1]] %>%
+ str_replace_all(";\n", "\t") %>%
+ str_replace_all(":\n", "\t") %>%
+ str_split("\n") %>%
+ .[[1]] %>%
+ .[. != ""] %>%
+ str_replace_all("\t", "\n") %>%
str_remove("^[:digit:]")
}
-
-
+
+
text
-
+
}
# Test
get_gdpr_text(read_html("http://gdpr-info.eu/art-2-gdpr/"))
# unnest the list column of text
-clean_articles <- all_articles %>%
- mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>%
+clean_articles <- all_articles %>%
+ mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>%
unnest_longer(gdpr_text)
# final dataframe
-final_articles <- clean_articles %>%
- group_by(article) %>%
- mutate(sub_article = row_number()) %>%
- relocate(sub_article, .after = "article_title") %>%
- relocate(gdpr_text, .after = "sub_article") %>%
- ungroup() %>%
- mutate(chapter = str_extract(chapter, "[:digit:]+")) %>%
- mutate_at(vars(chapter, article, sub_article), as.double) %>%
+final_articles <- clean_articles %>%
+ group_by(article) %>%
+ mutate(sub_article = row_number()) %>%
+ relocate(sub_article, .after = "article_title") %>%
+ relocate(gdpr_text, .after = "sub_article") %>%
+ ungroup() %>%
+ mutate(chapter = str_extract(chapter, "[:digit:]+")) %>%
+ mutate_at(vars(chapter, article, sub_article), as.double) %>%
select(-raw_html)
final_articles %>% view()
@@ -337,72 +337,72 @@ Cleaning Script
page <- read_html(link)
-temp <- page %>% html_nodes("script") %>%
- .[9] %>%
- rvest::html_text()
+temp <- page %>% html_nodes("script") %>%
+ .[9] %>%
+ rvest::html_text()
ends <- str_locate_all(temp, "\\]")
starts <- str_locate_all(temp, "\\[")
-table1 <- temp %>%
- stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>%
- str_remove_all("\\\n") %>%
+table1 <- temp %>%
+ stringr::str_sub(start = starts[[1]][1,2], end = ends[[1]][1,1]) %>%
+ str_remove_all("\\\n") %>%
str_remove_all("\\\r") %>%
- jsonlite::fromJSON() %>%
- as_tibble() %>%
+ jsonlite::fromJSON() %>%
+ as_tibble() %>%
mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))
-table2 <- temp %>%
- stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>%
- str_remove_all("\\\n") %>%
- str_remove_all("\\\r") %>%
- jsonlite::fromJSON() %>%
- as_tibble() %>%
+table2 <- temp %>%
+ stringr::str_sub(start = starts[[1]][2,2], end = ends[[1]][2,1]) %>%
+ str_remove_all("\\\n") %>%
+ str_remove_all("\\\r") %>%
+ jsonlite::fromJSON() %>%
+ as_tibble() %>%
mutate(summary = str_remove_all(summary,"<p>|</p>|\n"))
-all_df <- bind_rows(table1, table2) %>%
+all_df <- bind_rows(table1, table2) %>%
janitor::clean_names() %>%
mutate(
authority = str_remove(authority, "\t"),
- article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
+ article_violated = str_remove(article_violated, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
str_remove('</a>'),
article_violated = str_replace_all(article_violated, ", Art", "|Art"),
- type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
+ type = str_remove(type, '<a href="https://www.privacy-regulation.eu/en/32.htm">') %>%
str_remove('</a>')
)
# most frequent articles violated
-all_df %>%
- separate_rows(article_violated, sep = "\\|") %>%
+all_df %>%
+ separate_rows(article_violated, sep = "\\|") %>%
count(article_violated, sort = T)
-all_df %>%
+all_df %>%
write_tsv("2020/2020-04-21/gdpr_violations.tsv")
# Getting the actual article text -----------------------------------------
-raw_article <- "https://gdpr-info.eu/" %>%
+raw_article <- "https://gdpr-info.eu/" %>%
read_html()
# Get all the urls for specific articles/chapters
-gdpr_href <- raw_article %>%
- html_node(xpath = '//*[@id="tablepress-12"]') %>%
- html_nodes("a") %>%
+gdpr_href <- raw_article %>%
+ html_node(xpath = '//*[@id="tablepress-12"]') %>%
+ html_nodes("a") %>%
html_attr("href")
# pull the titles as well
-gdpr_titles <- raw_article %>%
- html_node(xpath = '//*[@id="tablepress-12"]') %>%
- html_nodes("a") %>%
+gdpr_titles <- raw_article %>%
+ html_node(xpath = '//*[@id="tablepress-12"]') %>%
+ html_nodes("a") %>%
html_attr("data-title")
# pull the numbers of article/chapters
-gdpr_numbers <- raw_article %>%
- html_node(xpath = '//*[@id="tablepress-12"]') %>%
- html_nodes("a") %>%
+gdpr_numbers <- raw_article %>%
+ html_node(xpath = '//*[@id="tablepress-12"]') %>%
+ html_nodes("a") %>%
html_text()
# put it all into a df
@@ -410,75 +410,75 @@ Cleaning Script
article = gdpr_numbers,
title = str_trim(gdpr_titles),
href = gdpr_href
-)
+)
# Tidy up the data, create chapters vs articles
-clean_gdpr <- gdpr_df %>%
+clean_gdpr <- gdpr_df %>%
mutate(chapter = if_else(str_length(article) > 3, article, NA_character_),
- chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>%
- fill(chapter, chapter_title) %>%
- filter(!str_detect(article, "Chapter")) %>%
- mutate(article = as.double(article)) %>%
- filter(!is.na(article)) %>%
+ chapter_title = if_else(str_length(article) > 3, title, NA_character_)) %>%
+ fill(chapter, chapter_title) %>%
+ filter(!str_detect(article, "Chapter")) %>%
+ mutate(article = as.double(article)) %>%
+ filter(!is.na(article)) %>%
select(starts_with("chapter"), article, article_title = title, href)
clean_gdpr
# LONG running outcome
# Get all the raw html from each of the urls for each article
-all_articles <- clean_gdpr %>%
+all_articles <- clean_gdpr %>%
mutate(raw_html = map(href, read_html))
# function to take raw html and turn it into text for that specific article
get_gdpr_text <- function(html_in){
-
- test_var <- html_in %>%
- html_node(".entry-content") %>%
- html_nodes("ol") %>%
+
+ test_var <- html_in %>%
+ html_node(".entry-content") %>%
+ html_nodes("ol") %>%
html_text()
-
+
if (length(test_var) == 0){
text <- html_in %>%
- html_node(".entry-content > p") %>%
- html_text() %>%
- str_remove("^[:digit:]")
+ html_node(".entry-content > p") %>%
+ html_text() %>%
+ str_remove("^[:digit:]")
} else {
- text <- html_in %>%
- html_node(".entry-content") %>%
- html_nodes("ol") %>%
- html_text() %>%
- .[[1]] %>%
- str_replace_all(";\n", "\t") %>%
- str_replace_all(":\n", "\t") %>%
- str_split("\n") %>%
- .[[1]] %>%
- .[. != ""] %>%
- str_replace_all("\t", "\n") %>%
+ text <- html_in %>%
+ html_node(".entry-content") %>%
+ html_nodes("ol") %>%
+ html_text() %>%
+ .[[1]] %>%
+ str_replace_all(";\n", "\t") %>%
+ str_replace_all(":\n", "\t") %>%
+ str_split("\n") %>%
+ .[[1]] %>%
+ .[. != ""] %>%
+ str_replace_all("\t", "\n") %>%
str_remove("^[:digit:]")
}
-
-
+
+
text
-
+
}
# Test
get_gdpr_text(read_html("http://gdpr-info.eu/art-2-gdpr/"))
# unnest the list column of text
-clean_articles <- all_articles %>%
- mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>%
+clean_articles <- all_articles %>%
+ mutate(gdpr_text = map(raw_html, get_gdpr_text)) %>%
unnest_longer(gdpr_text)
# final dataframe
-final_articles <- clean_articles %>%
- group_by(article) %>%
- mutate(sub_article = row_number()) %>%
- relocate(sub_article, .after = "article_title") %>%
- relocate(gdpr_text, .after = "sub_article") %>%
- ungroup() %>%
- mutate(chapter = str_extract(chapter, "[:digit:]+")) %>%
- mutate_at(vars(chapter, article, sub_article), as.double) %>%
+final_articles <- clean_articles %>%
+ group_by(article) %>%
+ mutate(sub_article = row_number()) %>%
+ relocate(sub_article, .after = "article_title") %>%
+ relocate(gdpr_text, .after = "sub_article") %>%
+ ungroup() %>%
+ mutate(chapter = str_extract(chapter, "[:digit:]+")) %>%
+ mutate_at(vars(chapter, article, sub_article), as.double) %>%
select(-raw_html)
final_articles %>% view()