Skip to content

Commit

Permalink
add CRANberries and de-duplication to curatinator
Browse files Browse the repository at this point in the history
  • Loading branch information
jonocarroll committed Oct 21, 2023
1 parent 8f01ece commit ed0e4d2
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 2 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ on:
- cron: '0 9 * * 6' # At 09:00 on Saturday UTC

jobs:
import-data:
auto-collect-content:
runs-on: ubuntu-latest
steps:
- name: Set up R
Expand All @@ -26,6 +26,7 @@ jobs:
any::tidyverse
any::tidyRSS
any::urltools
any::pkgsearch
- name: Check out repository
uses: actions/checkout@v3
Expand Down
88 changes: 87 additions & 1 deletion scripts/curatinator.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,93 @@
# Markdown file that every section of this script writes to.
OUTPUT_FILE <- "curatinator_latest.md"

## scrape RSS feeds for new posts in the last 10 days

# get_rss_posts() is not defined in this file; it is expected to come from the
# sourced helper script.
source("scripts/get_rss.R")

# keep only the feeds flagged as enabled in the feed list
f <- read.csv("rss_feeds.csv")
f <- f[f$ENABLE == 1, , drop = FALSE]
x <- get_rss_posts(f$URL)

# Start the output file afresh (no append). The separator must be the newline
# escape "\n"; the previous "/n" wrote a literal slash-n between entries.
# A single write through OUTPUT_FILE replaces the former pair of identical
# writes, one of which hard-coded the same path.
cat(x, file = OUTPUT_FILE, sep = "\n")

## scrape CRANberries for new/updated packages in the last 10 days

library(tidyRSS)
library(pkgsearch)
library(dplyr)
library(lubridate)

# create utility functions
# Pull the package name out of a single feed item link.
#
# item_link: one link string; assumed to carry a fragment of the form
#            "#<package>_<version>".
# Returns the text between the first "#" and the following "_", or NA when
# the link has no "#" part.
tidy_package_name <- function(item_link) {
  # everything after the first "#" (second piece of the split)
  after_hash <- strsplit(item_link, "#")[[1]][2]
  # the piece before the first "_" is the package name
  name_and_version <- strsplit(after_hash, "_")[[1]]
  name_and_version[1]
}

# Build one markdown bullet for a CRAN package.
#
# package_version, package_name, package_title: values substituted into the
#   bullet text.
# feed_type: "new" bullets always end with the package title; "updated"
#   bullets get the title only when add_update_titles is TRUE and a diffify
#   link only when add_diffify is TRUE. Any other value raises an error.
# Returns the result of glue::glue() applied to the assembled template.
create_text <- function(package_version, package_name, package_title, feed_type = "new", add_diffify = TRUE, add_update_titles = FALSE) {
  # Template pieces use |name| markers instead of glue's default braces,
  # because the bullet text itself contains literal "{" and "}".
  cran_link <- "+ [{|package_name|} |package_version|](https://cran.r-project.org/package=|package_name|)"
  title_suffix <- ": |package_title|"
  diffify_suffix <- " - [diffify](https://diffify.com/R/|package_name|)"

  if (feed_type == "new") {
    template <- paste0(cran_link, title_suffix)
  } else if (feed_type == "updated") {
    template <- cran_link
    if (add_update_titles == TRUE) {
      template <- paste0(template, title_suffix)
    }
    if (add_diffify == TRUE) {
      template <- paste0(template, diffify_suffix)
    }
  } else {
    stop("supplied type not recognized!", call. = FALSE)
  }

  # glue() is called directly here so the |name| markers resolve against this
  # function's arguments.
  glue::glue(template, .open = "|", .close = "|")
}

# Fetch one CRANberries feed and turn its recent entries into markdown bullets.
#
# feed_type:  "new" or "updated"; selects which CRANberries feed is read and is
#             passed on to create_text() for every row.
# start_date: earliest item publication date to keep (inclusive).
# end_date:   latest item publication date to keep (inclusive); defaults to
#             the current date.
#
# Returns the data frame produced by the pipeline, with columns
# package_version, package_name, package_title, feed_type, package_date and
# markdown_string (one bullet per kept feed item).
process_cranberries <- function(feed_type, start_date, end_date = as.Date(lubridate::now())) {
  # form the URL based on type (either "new" or "updated")
  cb_url <- glue::glue("https://dirk.eddelbuettel.com/cranberries/cran/{feed_type}/index.rss")
  # alias under a different name so the right-hand side of the mutate() below
  # is unambiguous
  ftype <- feed_type

  cb_tidy <- tidyRSS::tidyfeed(cb_url) %>%
    mutate(feed_type = ftype) %>%
    mutate(item_pub_date = as_date(item_pub_date)) %>%
    select(item_title, item_link, item_description, item_pub_date, feed_type) %>%
    distinct() %>%
    # Apply the date window before any metadata lookup, so only items that end
    # up in the output cost a pkgsearch request. The upper bound honours the
    # end_date argument (it was previously ignored in favour of now()).
    filter(item_pub_date >= as.Date(start_date) & item_pub_date <= as.Date(end_date)) %>%
    mutate(package_name = purrr::map_chr(item_link, tidy_package_name)) %>%
    # leverage the pkgsearch package to get metadata for each kept item
    mutate(
      package_meta = purrr::map(package_name, ~pkgsearch::cran_package(.x)),
      package_version = purrr::map_chr(package_meta, "Version"),
      package_title = purrr::map_chr(package_meta, "Title"),
      package_date = purrr::map_chr(package_meta, "Date/Publication")
    ) %>%
    select(package_version, package_name, package_title, feed_type, package_date) %>%
    # every remaining column except package_date matches a create_text()
    # argument by name, so pmap_chr() can pass each row straight through
    mutate(markdown_string = purrr::pmap_chr(select(., -package_date), create_text, add_update_titles = TRUE))

  cb_tidy
}


# both CRANberries sections below cover the same look-back window;
# Sys.Date() - 10 is already a Date, so no extra conversion is needed
cranberries_start <- Sys.Date() - 10

# obtain updated packages
cb_updated_df <- process_cranberries(feed_type = "updated", start_date = cranberries_start)

cat("\n## CRANberries UPDATED: ##\n", file = OUTPUT_FILE, sep = "\n", append = TRUE)

# print out the markdown text
cat(cb_updated_df$markdown_string, file = OUTPUT_FILE, sep = "\n", append = TRUE)

# obtain new packages
cb_new_df <- process_cranberries(feed_type = "new", start_date = cranberries_start)

cat("\n## CRANberries NEW: ##\n", file = OUTPUT_FILE, sep = "\n", append = TRUE)

# print out the markdown text
cat(cb_new_df$markdown_string, file = OUTPUT_FILE, sep = "\n", append = TRUE)

# terminate the last bullet line before the file is read back
cat("\n", file = OUTPUT_FILE, append = TRUE)

## De-duplicate

# Read by path: readLines() then manages its own connection, instead of
# leaving behind a file() connection object that is never closed.
collected <- readLines(OUTPUT_FILE)

# keep the first occurrence of every line, in order; writeLines() also ends
# the file with a newline
writeLines(unique(collected), OUTPUT_FILE)

0 comments on commit ed0e4d2

Please sign in to comment.