refactoring load_bib

SimonGreenhill · Dec 2, 2024 · 8107abb · 8107abb
1 parent 17b872d
commit 8107abb
Show file tree

Hide file tree

Showing 9 changed files with 97 additions and 249 deletions.
diff --git a/R/rcldf.R b/R/rcldf.R
@@ -1,10 +1,10 @@
 #' Reads a Cross-Linguistic Data Format dataset into an object.
 #'
 #' @param mdpath the path to the directory or metadata JSON file.
-#' @param load_bib a boolean flag (TRUE/FALSE, default TRUE) to load the
+#' @param load_bib a boolean flag (TRUE/FALSE, default FALSE) to load the
 #'     sources.bib BibTeX file. `load_bib=FALSE` can easily speed up loading
-#'     of a CLDF dataset by an order of magnitude or two, so consider this if
-#'     you don't need access to the source information.
+#'     of a CLDF dataset by an order of magnitude or two, so we do not load
+#'     sources by default.
 #' @param cache_dir a directory to cache downloaded files to
 #' @return A `cldf` object
 #' @export
@@ -33,13 +33,13 @@ cldf <- function(mdpath, load_bib=TRUE, cache_dir=tools::R_user_dir("rcldf", whi
 
     for (i in 1:length(md$metadata$tables)) {
         tfile <- md$metadata$tables[[i]][['url']]
-        
+
         logger::log_debug("cldf: handle table ", tfile, namespace="cldf")
         table <- get_tablename(md$metadata$tables[[i]][['dc:conformsTo']], tfile)
         filename <- get_filename(o$base_dir, tfile)
-        
+
         if (table %in% names(o[["tables"]])) { stop(paste("Duplicate name: ", table)) }
-        
+
         if (!is.null(filename) && file.exists(filename)) {
             o[["tables"]][[table]] <- add_dataframe(md$metadata$tables[[i]], filename, md$metadata)
             o[["resources"]][[basename(tfile)]] <- table
@@ -54,7 +54,7 @@ cldf <- function(mdpath, load_bib=TRUE, cache_dir=tools::R_user_dir("rcldf", whi
         # `as_data_frame()` was deprecated in tibble 2.0.0.
         # this has not been updated since 2020, so we should replace it.
         # See: https://github.com/SimonGreenhill/rcldf/issues/13
-        o$sources <- read_bib(o)
+        o <- read_bib(o)
     }
 
     logger::log_debug("cldf: run nullify", namespace="cldf")

diff --git a/R/read_bib.R b/R/read_bib.R
@@ -8,8 +8,8 @@ read_bib <- function(object){
 
     bib <- get_filename(object$base_dir, object$metadata[['dc:source']])
 
-    if (is.null(bib)) { return(NA) }  # no bib defined
-    if (!file.exists(bib)) { return(NA) }  # file doesn't exist
+    if (is.null(bib)) { return(object) }  # no bib defined
+    if (!file.exists(bib)) { return(object) }  # file doesn't exist
     if (tolower(tools::file_ext(bib)) == 'zip') {
         logger::log_debug("load_bib - encountered zip file: ", bib, namespace="load_bib")
         # get original name (probably sources.bib)
@@ -24,5 +24,6 @@ read_bib <- function(object){
 
     # we suppress the warning `Column `YEAR` contains character strings.` as it
     # confuses users (it's actually a message not a warning)
-    return(suppressMessages(bib2df::bib2df(bib)))
+    object$sources <- suppressMessages(bib2df::bib2df(bib))
+    return(object)
 }
diff --git a/README.md b/README.md
@@ -14,18 +14,27 @@ library(devtools)
 install_github("SimonGreenhill/rcldf", dependencies = TRUE)
 ```
 
-## Example
+## Usage
 
-```r
-# create a `cldf` object giving either a path to the directory
-# or the metadata.json file, or a URL:
+### Load a CLDF dataset:
+
+You create a `cldf` object by giving either a path to the directory the CLDF
+dataset is stored in (i.e. where the _metadata.json_ file lives), a path to
+the _metadata.json_ file itself, or a URL where we can find the CLDF dataset.
+
 
+```r
 > df <- cldf('/path/to/dir/wals_1a_cldf')
 > df <- cldf('/path/to/dir/wals_1a_cldf/StructureDataset-metadata.json')
 > df <- cldf("https://zenodo.org/record/7844558/files/grambank/grambank-v1.0.3.zip?download=1")
 > df <- cldf('https://github.com/phlorest/greenhill_et_al2023')
+```
+
+### Explore a CLDF dataset:
 
-# a cldf object has various bits of information
+A `cldf` object has various bits of information:
+
+```r
 > summary(df)
 A Cross-Linguistic Data Format (CLDF) dataset:
 Name: My Dataset
@@ -36,13 +45,18 @@ Tables:
   3/4: ParameterTable (6 columns, 1 rows)
   4/4: ValueTable (7 columns, 563 rows)
 Sources: 947
+```
 
+Each table is attached to the _df$tables_ list:
 
-# each table is attached to the df$tables list.
+```r
 > names(df$tables)
 [1] "ValueTable"     "LanguageTable"  "ParameterTable" "CodeTable" 
+```
 
+...and we can access these tables:
 
+```r
 > df$tables$LanguageTable
 # A tibble: 563 x 9
    ID    Name   Macroarea Latitude Longitude Glottocode ISO639P3code Genus     Family   
@@ -52,12 +66,15 @@ Sources: 947
  3 ach   Aché   NA          -25.2      -55.2 ache1246   guq          Tupi-Gua… Tupian   
 
 
+# OR
 > df$tables$ParameterTable
 # A tibble: 1 x 6
   ID    Name                 Description Authors       Url                      Area    
   <chr> <chr>                <chr>       <chr>         <chr>                    <chr>   
 1 1A    Consonant Inventori… NA          Ian Maddieson http://wals.info/featur… Phonolo… 
 
+
+# OR
 > df$tables$ValueTable
 # A tibble: 563 x 7
    ID     Language_ID Parameter_ID Value Code_ID Comment Source                                       
@@ -67,20 +84,54 @@ Sources: 947
  3 1A-ach ach         1A           1     1A-1    NA      Susnik-1974                                  
  4 1A-acm acm         1A           2     1A-2    NA      Olmsted-1966;Olmsted-1964
 
-
+```
+
+### Load all the source information
+
+CLDF datasets have sources stored in BibTeX format. We don't load them by default,
+as it can take a long time to parse the BibTeX file correctly.
+
+You can load them like this:
+
+```r
+o <- cldf('/path/to/dir/wals_1a_cldf', load_bib=TRUE)
+# or, if you loaded the CLDF without sources the first time:
+o <- read_bib(o)
+```
+
+...and then access them by:
+
+```r
+o$sources
+```
+
+
+
+### Construct a 'wide' table with all foreign key entries filled in:
+
+Sometimes people want to have all the data from a CLDF dataset as one dataframe.
+
+Use `as.cldf.wide` to do this, passing it the name of a table to act as the base.
+
+This will take the base table, and resolve all foreign keys (usually `*_ID`) into
+their own columns.
+
+For example, this dataset has a `CodeTable` which connects to the `ParameterTable` 
+via `Parameter_ID`:
+
+```r
 > df$tables$CodeTable
 # A tibble: 5 x 4
   ID    Parameter_ID Name             Description
-  <chr> <chr>        <chr>            <chr>      
-1 1A-1  1A           Small            NA         
-2 1A-2  1A           Moderately small NA         
-3 1A-3  1A           Average          NA         
-4 1A-4  1A           Moderately large NA         
-5 1A-5  1A           Large            NA         
-
+  <chr> <chr>        <chr>            <chr>
+1 1A-1  1A           Small            NA
+2 1A-2  1A           Moderately small NA
+```
 
+Using `as.cldf.wide` we can combine all the information from `ParameterTable` into
+the `CodeTable`:
 
-# You can extract a "wide" table, with all foreign key entries filled in:
+```r
 > as.cldf.wide(df, 'CodeTable')
 
 # A tibble: 5 x 9
@@ -92,34 +143,18 @@ Sources: 947
 4 1A-4  1A           Moderatel… a moderately la… Consonant Inve… NA               Ian Ma…
 5 1A-5  1A           Large      a large thing    Consonant Inve… NA               Ian Ma…
 # … with 2 more variables: Url <chr>, Area <chr>
+```
 
+Note that name clashes between the two tables are resolved by appending the table name
+(e.g. the column `Name` in the original `CodeTable` is now `Name.CodeTable`).
 
 
-# Or: 
-> as.cldf.wide(df, 'ValueTable')
-
-# A tibble: 9 x 23
-  ID    Language_ID Parameter_ID.Va… Value Code_ID Comment Source Name.LanguageTable
-  <chr> <chr>       <chr>            <chr> <chr>   <chr>   <chr>  <chr>         
-1 1A-a… abi         1A               2     1A-2    NA      Najli… Abipón        
-2 1A-a… abk         1A               5     1A-5    NA      Hewit… Abkhaz        
-3 1A-a… ach         1A               1     1A-1    NA      Susni… Aché          
-4 1A-a… acm         1A               2     1A-2    NA      Olmst… Achumawi      
-5 1A-a… aco         1A               5     1A-5    NA      Mille… Acoma         
-6 1A-a… adz         1A               2     1A-2    NA      Holzk… Adzera        
-7 1A-a… agh         1A               3     1A-3    NA      Hyman… Aghem         
-8 1A-a… aht         1A               4     1A-4    NA      Kari-… Ahtna         
-9 1A-a… aik         1A               3     1A-3    NA      Hanke… Aikaná        
-# … with 15 more variables: Macroarea <chr>, Latitude <dbl>, Longitude <dbl>,
-#   Glottocode <chr>, ISO639P3code <chr>, Genus <chr>, Family <chr>,
-#   Name.parameters <chr>, Description.ParameterTable <chr>, Authors <chr>, Url <chr>,
-#   Area <chr>, Parameter_ID.CodeTable <chr>, Name.CodeTable <chr>, Description.CodeTable <chr>
-
+### Load just one table:
 
-# If you just want to get one table:
+Sometimes you just want to get one table:
 
+```r
 df <- get_table_from('LanguageTable', '/path/to/dir/wals_1a_cldf')
-
 ```