From 00731aba71737e3d2af49a12f5cddf400d9c548f Mon Sep 17 00:00:00 2001 From: JakeTufts <69049500+JakeTufts@users.noreply.github.com> Date: Fri, 1 Sep 2023 19:06:57 +0100 Subject: [PATCH 01/13] Changed potential typo --- fprog.qmd | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fprog.qmd b/fprog.qmd index 3d392fc..40c6fc0 100644 --- a/fprog.qmd +++ b/fprog.qmd @@ -484,10 +484,10 @@ sqrt(-5) This only raises a warning and returns `NaN` (Not a Number). This can be quite dangerous, especially when working non-interactively, which is what we will be doing a lot later on. It is much better if a pipeline fails early due to an -error, than dragging a `NaN` value. This also happens with `sqrt()`: +error, than dragging a `NaN` value. This also happens with `log10()`: ```{r} -sqrt(-10) +log10(-10) ``` So it could be useful to redefine these functions to raise an error instead, for @@ -705,7 +705,6 @@ fact_iter <- function(n){ result = 1 for(i in 1:n){ result = result * i - i = i + 1 } result } From f41e206b9e3f49fac6ed87691968d5a803c2f523 Mon Sep 17 00:00:00 2001 From: JakeTufts <69049500+JakeTufts@users.noreply.github.com> Date: Sun, 3 Sep 2023 21:28:12 +0100 Subject: [PATCH 02/13] Add missing apostrophe to code --- lit_prog.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lit_prog.qmd b/lit_prog.qmd index 84c9e92..6970bf8 100644 --- a/lit_prog.qmd +++ b/lit_prog.qmd @@ -753,7 +753,7 @@ create this. This is this function: ```{r} return_section <- function(dataset, var){ a <- knitr::knit_expand(text = c( - ## Frequency table for variable: {{variable}}", + "## Frequency table for variable: {{variable}}", create_table(dataset, var)), variable = var) cat(a, sep = "\n") @@ -984,7 +984,7 @@ that I recommend tick the following two important boxes: - Work the same way regardless of output format (Word, PDF or Html); - Work for any type of table: summary tables, regression tables, two-way tables, etc. 
-Let's start with the simplest type of table, which would is a table that simply +Let's start with the simplest type of table, which would be a table that simply shows some rows of data. `{knitr}` comes with the `kable()` function, but this function generates a very plain looking output. For something publication-worthy, we recommend the `{flextable}` package, developed by From e77370fbd20809552b5d17747901f8e6e7668497 Mon Sep 17 00:00:00 2001 From: JakeTufts <69049500+JakeTufts@users.noreply.github.com> Date: Wed, 6 Sep 2023 13:12:07 +0100 Subject: [PATCH 03/13] Correcting table position number for wiki scraping --- project_rewrite.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project_rewrite.qmd b/project_rewrite.qmd index aead2da..3fdf274 100644 --- a/project_rewrite.qmd +++ b/project_rewrite.qmd @@ -352,7 +352,7 @@ We can scrape current communes: ```{r} get_current_communes <- function( url = "https://w.wiki/6nPu", - table_position = 1 + table_position = 2 ){ read_html(url) %>% From 869bcc29833c5f35426b7fc3db7a28875be38ec8 Mon Sep 17 00:00:00 2001 From: JakeTufts <69049500+JakeTufts@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:44:50 +0100 Subject: [PATCH 04/13] Adding -la to bash command --- repro_intro.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repro_intro.qmd b/repro_intro.qmd index e69dd24..fdb5669 100644 --- a/repro_intro.qmd +++ b/repro_intro.qmd @@ -216,7 +216,7 @@ can find in the `renv` folder. 
Let’s take a look at the contents of this folde ::: {.content-hidden when-format="pdf"} ```bash -owner@localhost ➤ ls renv +owner@localhost ➤ ls -la renv ``` ::: From d9d4010f2e56d55dd7b6b905a25bcb45ed822d4c Mon Sep 17 00:00:00 2001 From: JakeTufts <69049500+JakeTufts@users.noreply.github.com> Date: Mon, 11 Sep 2023 09:37:24 +0100 Subject: [PATCH 05/13] Minor typos and potential clearer code for installing specific branch --- packages.qmd | 6 +++--- repro_intro.qmd | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packages.qmd b/packages.qmd index 213a4cf..9dc9dfe 100644 --- a/packages.qmd +++ b/packages.qmd @@ -729,7 +729,7 @@ Something important to notice as well: my fusen-ready `.Rmd` file is simply called `save_data.Rmd`, while the generated, inflated file, that will be part of the package under the `vignettes/` folder is called `dev-save_data.Rmd`. -When you inflate you a flat file into a package, the R console will be verbose. +When you inflate a flat file into a package, the R console will be verbose. This lists all files that are created or modified, but there is also a long list of checks that run automatically. This is the output of `devtools::check()` that is included inside `fusen::inflate()`. 
This function verifies that your package, @@ -847,7 +847,7 @@ It is also possible to install the package from a specific branch: ```{r, eval = F} remotes::install_github( - "github_username/repository_name@repo_name" + "github_username/repository_name@branch_name" ) ``` @@ -856,7 +856,7 @@ commit: ```{r, eval = F} remotes::install_github( - "github_username/repository_name@repo_name", + "github_username/repository_name@branch_name", ref = "commit_hash" ) ``` diff --git a/repro_intro.qmd b/repro_intro.qmd index fdb5669..aefc9ef 100644 --- a/repro_intro.qmd +++ b/repro_intro.qmd @@ -575,7 +575,7 @@ The first problem, and I’m repeating myself here, is that `{renv}` only record the R version used for the project, but does not restore it when calling `renv::restore()`. You need to install the right R version yourself. On Windows this should be fairly easy to do, but then you need to start juggling R versions -and know which scrips need which R version, which can get confusing. +and know which scripts need which R version, which can get confusing. 
There is the `{rig}` package that makes it easy to install and switch between R versions that you could check From dfe378307851b1f598b8f2f330c835b6698e6d7f Mon Sep 17 00:00:00 2001 From: JakeTufts <69049500+JakeTufts@users.noreply.github.com> Date: Sun, 24 Sep 2023 20:56:38 +0100 Subject: [PATCH 06/13] More small typo edits --- packages.qmd | 1 + repro_cont.qmd | 16 ++++++++-------- targets.qmd | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/packages.qmd b/packages.qmd index 9dc9dfe..9aba782 100644 --- a/packages.qmd +++ b/packages.qmd @@ -690,6 +690,7 @@ is that I’ve added examples: ````{verbatim} ```{r examples-get_laspeyeres, eval = FALSE} #' \dontrun{ +#' country_level_data_laspeyeres <- get_laspeyeres_index(country_level_data) #' commune_level_data_laspeyeres <- get_laspeyeres(commune_level_data) #' } ``` diff --git a/repro_cont.qmd b/repro_cont.qmd index 4c23dd1..864ce0d 100644 --- a/repro_cont.qmd +++ b/repro_cont.qmd @@ -82,7 +82,7 @@ architecture with their Apple silicon CPUs (as of writing, the Mac Pro is the only computer manufactured by Apple that doesn't use an Apple silicon CPU and only because it was released in 2019) and it wouldn't surprise me if other manufacturers follow suit and develop their own ARM cpus. This means that -projects written today may not run anymore in the future, because of this +projects written today may not run anymore in the future, because of these architecture changes. Libraries compiled for current architectures would need to be recompiled for ARM, and that may be difficult. @@ -534,7 +534,7 @@ Google search (but I'm giving it to you, dear reader, for free). Then come `RUN` statements. The first one uses Ubuntu's package manager to first refresh the repositories (this ensures that our local Ubuntu installation -repositories are in synch with the latest software updates that were pushed to +repositories are in sync with the latest software updates that were pushed to the central Ubuntu repos). 
Then we use Ubuntu's package manager to install `r-base`. `r-base` is the package that installs R. We then finish this Dockerfile by running `CMD ["R"]`. This is the command that will be executed @@ -587,7 +587,7 @@ What is going on here? When you run a container, the command specified by `CMD` gets executed, and then the container quits. So here, the container ran the command `R`, which started the R interpreter, but then quit immediately. When quitting R, users should specify if they want to save or not save the workspace. -This is what the message above is telling us. So, how can be use this? Is there +This is what the message above is telling us. So, how can we use this? Is there a way to use this R version interactively? Yes, there is a way to use this R version boxed inside our Docker image @@ -694,7 +694,7 @@ as a file. I’ll explain how later. The Rocker project offers many different images, which are described [here](https://rocker-project.org/images/)^[https://rocker-project.org/images/]. We are going to be using the *versioned* images. These are images that ship -specific versions of R. This way, it doesn't matter when the image gets build, +specific versions of R. This way, it doesn't matter when the image gets built, the same version of R will be installed by getting built from source. Let me explain why building R from source is important. When we build the image from the Dockerfile we wrote before, R gets installed from the Ubuntu repositories. @@ -882,7 +882,7 @@ and final step: This runs the `R` program from the Linux command line with the option `-e`. This option allows you to pass an `R` expression to the command line, which needs to -be written between `""`. Using `R -e` will quickly become an habit, because this +be written between `""`. Using `R -e` will quickly become a habit, because this is how you can run R non-interactively, from the command line. 
The expression we pass sets the working directory to `/home/housing`, and then we use `renv::init()` and `renv::restore()` to restore the packages from the @@ -1086,7 +1086,7 @@ the R session in the right directory. So we move to the right directory, then we run the pipeline using `R -e "targets::tar_make()"`. Notice that we do both operations within a `RUN` statement. This means that the pipeline will run at build-time (remember, `RUN` statements run at build-time, `CMD` statements at -run-time). In order words, the image will contain the outputs. This way, if the +run-time). In other words, the image will contain the outputs. This way, if the build process and the pipeline take a long time to run, you can simply leave them running overnight for example. In the morning, while sipping on your coffee, you can then simply run the container to instantly get the outputs. This @@ -1320,7 +1320,7 @@ By following these two rules, you should keep any issues to a minimum. When or if you need to update R and/or the package library on your machine, simply create a new Docker image that reflects these changes. -However, if work in a field where operating system versions matter, then yes, +However, if you work in a field where operating system versions matter, then yes, you should find a way to either use the dockerized environment for development, or you should install Ubuntu on your computer (the same version as in Docker of course). @@ -1636,7 +1636,7 @@ needs mitigation, and thus a plan B. This plan B could be to host the images yourself, by saving them using `docker save`. Or you could even self-host an image registry (or lobby your employer/institution/etc to host a registry for its developers/data scientists/researchers). In any case, it's good to have -options and now what potential risks using this technology entail. +options and know what potential risks using this technology entail. ### Is Docker enough? 
diff --git a/targets.qmd b/targets.qmd index 7807eb9..34d1462 100644 --- a/targets.qmd +++ b/targets.qmd @@ -926,7 +926,7 @@ This pipeline loads the `.csv` file from before and creates a summary of the data as well as plot. But we don’t simply want these objects to be saved as `.rds` files by the pipeline, we want to be able to use them to write a document (either in the `.Rmd` or `.Qmd` format). For this, we need another package, -called `{tarchetypes}`. This package comes many functions that allow you to +called `{tarchetypes}`. This package comes with many functions that allow you to define new types of targets (these functions are called *target factories* in `{targets}` jargon). The new target factory that we need is `tarchetypes::tar_render()`. As you can probably guess from the name, this @@ -1373,7 +1373,7 @@ default, `data()` loads the data in the global environment. But remember, we want our function to be pure, meaning, it should only return the data object and not load anything into the global environment! So that’s where the temporary environment created in the first line of the body of the function comes into -play. What happens is that the functions loads the data object into this +play. What happens is that the function loads the data object into this temporary environment, which is different from the global environment. Once we’re done, we can simply discard this environment, and so our global environment stays clean. From 5835b06cc9bdd39c6aaae4aef9c5139703e2dc7a Mon Sep 17 00:00:00 2001 From: Bruno Rodrigues Date: Mon, 25 Sep 2023 16:50:58 +0200 Subject: [PATCH 07/13] started updating links --- intro.qmd | 126 ++++++++++++++++++++++---------------------- project_rewrite.qmd | 6 +-- project_start.qmd | 30 +++++++---- 3 files changed, 87 insertions(+), 75 deletions(-) diff --git a/intro.qmd b/intro.qmd index b5857b1..43a7604 100644 --- a/intro.qmd +++ b/intro.qmd @@ -6,14 +6,14 @@ visualisation. 
The goal is to teach you a set of tools, practices and project management techniques that should make your projects easier to reproduce, replicate and retrace. These tools and techniques can be used right from the start of your -project at a minimal cost, such that once you’re done with the analysis, you’re +project at a minimal cost, such that once you're done with the analysis, you're also done with making the project reproducible. Your projects are going to be reproducible simply because they were engineered, from the start, to be reproducible. There are two main ideas in this book that you need to keep in mind at all times: -- DRY: Don’t Repeat Yourself; +- DRY: Don't Repeat Yourself; - WIT: Write IT down. DRY WIT is not only the best type of humour, it is also the best way to write @@ -24,7 +24,7 @@ reproducible analytical pipelines. This book is for anyone that uses raw data to build any type of output based on that raw data. This can be a simple quarterly report for example, in which the data is used for tables and graphs, or a scientific article for a peer reviewed -journal or even an interactive web application. It doesn’t matter, because the +journal or even an interactive web application. It doesn't matter, because the process is, at its core, always very similar: - Get the data; @@ -33,7 +33,7 @@ process is, at its core, always very similar: - Put the results into the final product. This book will already assume some familiarity with programming, and in -particular the R programming language. However, if you’re comfortable with +particular the R programming language. However, if you're comfortable with another programming language like Python, you could still learn a lot from reading this book. The tools presented in this book are specific to R, but there will always be an alternative for the language you prefer using, meaning @@ -43,12 +43,12 @@ that you could apply the advice from this book to your needs and preferences. 
The aim of this book is to make the process of analysing data as reliable, retraceable, and reproducible as possible, and do this by design. This means -that once you’re done with the analysis, you’re done. You don’t want to spend +that once you're done with the analysis, you're done. You don't want to spend time, which you often don't have anyways, to rewrite or refactor an analysis and make it reproducible after the fact. We both know that this is not going to -happen. Once an analysis is done, it’s time to go to the next analysis. And if +happen. Once an analysis is done, it's time to go to the next analysis. And if you need to rerun an older analysis (for example, because the data got updated), -then you’ll simply figure it out at that point, right? That's a problem for +then you'll simply figure it out at that point, right? That's a problem for future you, right? Hopefully, future you will remember every quirk of your code and know which script to run at which point in the process, which comments are outdated and can be safely ignored, what features of the data need to be checked @@ -60,8 +60,8 @@ Going forward, I'm going to refer to a project that is reproducible as a make such a RAP; either you are lucky enough to have someone on the team whose job is to turn your messy code into a RAP, or you do it yourself. And this second option is very likely the most common. The issue is, as stated above, -that most of us simply don’t do it. We are always in the rush to get to the -results, and don’t think about making the process reproducible. This is because +that most of us simply don't do it. We are always in the rush to get to the +results, and don't think about making the process reproducible. This is because we always think that making the process reproducible takes time and this time is better spent working on the analysis itself. But this is a misconception, for two reasons. 
@@ -74,16 +74,16 @@ techniques will even save you time (especially testing) and headaches. The second reason is that an analysis is never, ever, a one-shot. Only the most simple things, like pulling out a number from some data base may be a one-shot. -And even then, chances are that once you provide that number, you’ll be asked to +And even then, chances are that once you provide that number, you'll be asked to pull out a variation of that number (for example, by disaggregating by one or -several variables). Or maybe you’ll get asked for an update to that number in +several variables). Or maybe you'll get asked for an update to that number in six months. So you will learn very quickly to keep that SQL query in a script somewhere to make sure that you provide a number that is consistent. But what about more complex analyses? Is keeping the script enough? Keeping the script is already a good start of course. The problem is that very often, there is no script, or not a script for each step of the analysis. -I’ve seen this play out many times in many different organisations. It’s that +I've seen this play out many times in many different organisations. It's that time of the year again, we have to write a report. 10 people are involved, and just gathering the data is already complicated. Some get their data from Word documents attached to emails, some from a website, some from a report from @@ -91,11 +91,11 @@ another department that is a PDF... I remember a story that a senior manager at my previous job used to tell us: once, a client put out a call for a project that involved helping them setting up a PDF scraper. They periodically needed data from another department that came in PDFs. The manager asked what was, at -least from our perspective, an obvious question: why can’t they send you the +least from our perspective, an obvious question: why can't they send you the underlying data from that PDF in a machine readable format? They had never thought to ask. 
So my manager went to that department, and talked to the people putting that PDF together. Their answer? "Well, we could send them the data in -any format they want, but they’ve asked us to send the tables in a PDF format". +any format they want, but they've asked us to send the tables in a PDF format". So the first, and probably most important lesson here is: when starting to build a RAP, make sure that you talk with all the people involved. @@ -109,7 +109,7 @@ complex projects. Ideally, you should know about packages, how to install them, you should have written some functions already, know about loops and have some basic knowledge of data structures like lists. While this is not a book on visualisation, we will be making some graphs using the `{ggplot2}` package, so -if you’re familiar with that, that’s good. If not, no worries, visualisation, +if you're familiar with that, that's good. If not, no worries, visualisation, data munging or data analysis is not the point of this book. Chapter 2, *Before we start* should help you gauge how easily you will be able to follow this book. @@ -131,33 +131,33 @@ manipulation by a human somewhere in the loop. A reproducible project means that this project can be rerun by anyone at 0 (or very minimal) cost. But there are different levels of reproducibility, and I -will discuss this in the next section. Let’s first discuss some requirements +will discuss this in the next section. Let's first discuss some requirements that a project must have to be considered a RAP. ### Using open-source tools to build a RAP is a hard requirement Open source is a hard requirement for reproducibility. -No ifs nor buts. And I’m not only talking about the code you typed for your -research paper/report/analysis. I’m talking about the whole ecosystem that you +No ifs nor buts. And I'm not only talking about the code you typed for your +research paper/report/analysis. 
I'm talking about the whole ecosystem that you used to type your code and build the workflow. -Is your code open? That’s good. Or is it at least available to other people from +Is your code open? That's good. Or is it at least available to other people from your organisation, in a way that they could re-execute it if needed? Good. But is it code written in a proprietary program, like STATA, SAS or MATLAB? Then -your project is not reproducible. It doesn’t matter if this code is well +your project is not reproducible. It doesn't matter if this code is well documented and written and available on a version control system (internally to your company or open to the public). This project is just not reproducible. Why? Because on a long enough time horizon, there is no way to re-execute your code with the exact same version of the proprietary programming language and on the exact same version of the operating system that was used at the time the project -was developed. As I’m writing these lines, MATLAB, for example, is at version -R2022b. And buying an older version may not be simple. I’m sure if you contact +was developed. As I'm writing these lines, MATLAB, for example, is at version +R2022b. And buying an older version may not be simple. I'm sure if you contact their sales department they might be able to sell you an older version. Maybe -you can even simply re-download older versions that you’ve already bought from their -website. But maybe it’s not that simple. Or maybe they won’t offer this option +you can even simply re-download older versions that you've already bought from their +website. But maybe it's not that simple. Or maybe they won't offer this option anymore in the future, who knows? In any case, if you google "purchase old version of Matlab" you will see that many researchers and engineers have this need. 
@@ -177,46 +177,46 @@ knitr::include_graphics("images/matlab_old_version.png") ``` ::: -And if you’re running old code written for version, say, R2008a, there’s no +And if you're running old code written for version, say, R2008a, there's no guarantee that it will produce the exact same results on version 2022b. And -let’s not even mention the toolboxes (if you’re not familiar with MATLAB’s -toolboxes, they’re the equivalent of packages or libraries in other programming -languages). These evolve as well, and there’s no guarantee that you can purchase -older versions of said toolboxes. And it’s likely that newer versions of +let's not even mention the toolboxes (if you're not familiar with MATLAB's +toolboxes, they're the equivalent of packages or libraries in other programming +languages). These evolve as well, and there's no guarantee that you can purchase +older versions of said toolboxes. And it's likely that newer versions of toolboxes cannot even run on older versions of Matlab. -And let me be clear, what I’m describing here with MATLAB could also be said for +And let me be clear, what I'm describing here with MATLAB could also be said for any other proprietary programs still commonly (unfortunately) used in research and in statistics (like STATA, SAS or SPSS). And even if some, or even all, of the editors of these proprietary tools provide ways to buy and run older versions of their software, my point is that the fact that you have to rely on them for this is a barrier to reproducibility, and there is no guarantee they will provide the option to purchase older versions forever. Also, who guarantees -that the editors of these tools will be around forever? Or, and that’s more +that the editors of these tools will be around forever? Or, and that's more likely, that they will keep offering a program that you install on your machine instead of shifting to a subscription based model? 
*For just $199 a month, you can execute your SAS (or whatever) scripts on the cloud! Worry about data confidentiality? No worries, data gets encrypted and -stored safely on our secure servers! Run your analysis from anywhere and don’t +stored safely on our secure servers! Run your analysis from anywhere and don't worry about losing your work if your cat knocks over your coffee on your laptop! And if you purchase the pro licence, for an additional $100 a month, you can even execute your code in parallel!* -Think this is science fiction? Google "SAS cloud" to see SAS’s cloud based +Think this is science fiction? Google "SAS cloud" to see SAS's cloud based offering. ### There are hidden dependencies that can hinder the reproducibility of a project -Then there’s another problem: let’s suppose you’ve written a nice, thoroughly -tested and documented workflow, and made it available on Github (and let’s even +Then there's another problem: let's suppose you've written a nice, thoroughly +tested and documented workflow, and made it available on Github (and let's even assume that the data is available for people to freely download, and that the -paper is open access). Or, if you’re working in the private sector, you did +paper is open access). Or, if you're working in the private sector, you did everything above as well, the only difference being that the workflow is only available to people inside the company instead of being available freely and publicly online. -Let’s further assume that you’ve used R or Python, or any other open source +Let's further assume that you've used R or Python, or any other open source programming language. Could this study/analysis be said to be reproducible? Well, if the analysis ran on a proprietary operating system, then the conclusion is: your project is not reproducible. @@ -226,7 +226,7 @@ outputs that your pipeline builds. There are some particularities in operating systems that may make certain things work differently. 
Admittedly, this is in practice rarely a problem, but [it does happen](https://github.com/numpy/numpy/issues/9187)^[https://github.com/numpy/numpy/issues/9187], -especially if you’re working with very high precision floating point arithmetic +especially if you're working with very high precision floating point arithmetic like you would do in the financial sector for instance. Thankfully, there is no need to change operating systems to deal with this @@ -239,7 +239,7 @@ it has to respect the following bullet points: - Source code must obviously be available and thoroughly tested and documented (which is why we will be using Git and Github); - All the dependencies must be easy to find and install (we are going to deal with this using dependency management tools); -- To be written with an open source programming language (nocode tools like Excel are by default non-reproducible because they can’t be used non-interactively, and which is why we are going to use the R programming language); +- To be written with an open source programming language (nocode tools like Excel are by default non-reproducible because they can't be used non-interactively, and which is why we are going to use the R programming language); - The project needs to be run on an open source operating system (thankfully, we can deal with this without having to install and learn to use a new operating system, thanks to Docker); - Data and the paper/report need obviously to be accessible as well, if not publicly as is the case for research, then within your company. This means that the concept of "scripts and/or data available upon request" belongs in the trash. @@ -247,37 +247,37 @@ it has to respect the following bullet points: ::: {.content-hidden when-format="pdf"}
 <figure>
     <img src="images/reasonable_request.png"
-         alt="A real sentence from a real paper published in *THE LANCET Regional Health*. How about *make the data available and I won’t scratch your car*, how’s that for a reasonable request?">
-    <figcaption>A real sentence from a real paper published in *THE LANCET Regional Health*. How about *make the data available and I won’t scratch your car*, how’s that for a reasonable request?</figcaption>
+         alt="A real sentence from a real paper published in *THE LANCET Regional Health*. How about *make the data available and I won't scratch your car*, how's that for a reasonable request?">
+    <figcaption>A real sentence from a real paper published in *THE LANCET Regional Health*. How about *make the data available and I won't scratch your car*, how's that for a reasonable request?</figcaption>
 </figure>
::: ::: {.content-visible when-format="pdf"} ```{r, echo = F} -#| fig-cap: "A real sentence from a real paper published in *THE LANCET Regional Health*. How about *make the data available and I won’t scratch your car*, how’s that for a reasonable request?" +#| fig-cap: "A real sentence from a real paper published in *THE LANCET Regional Health*. How about *make the data available and I won't scratch your car*, how's that for a reasonable request?" knitr::include_graphics("images/reasonable_request.png") ``` ::: ## Are there different types of reproducibility? -Let’s take one step back: we live in the real world, and in the real world, +Let's take one step back: we live in the real world, and in the real world, there are some constraints that are outside of our control. These constraints can make it impossible to build a true RAP, so sometimes we need to settle for something that might not be a true RAP, but a second or even third best thing. -In what follows, let’s assume this: in the discussion below, code is tested and -documented, so let’s only discuss the code running the pipeline itself. +In what follows, let's assume this: in the discussion below, code is tested and +documented, so let's only discuss the code running the pipeline itself. The *worst* reproducible pipeline would be something that works, but only on your machine. This can be simply due to the fact that you hardcoded paths that only exist on your laptop. Anyone wanting to rerun the pipeline would need to change the paths. This is something that needs to be documented in a README -which we assumed was the case, so there’s that. But maybe this pipeline only -runs on your laptop because the computational environment that you’re using is -hard to reproduce. Maybe you use software, even if it’s open source software, +which we assumed was the case, so there's that. But maybe this pipeline only +runs on your laptop because the computational environment that you're using is +hard to reproduce. 
Maybe you use software, even if it's open source software, that is not easy to install (anyone that tried to install R packages on Linux -that depend on the `{rJava}` package know what I’m talking about). +that depend on the `{rJava}` package know what I'm talking about). So a least worse pipeline would be one that could be run more easily on any similar machine to yours. This could be achieved by not using hardcoded absolute @@ -298,14 +298,14 @@ The issue here is that you need to make sure that the right versions of the packages get installed. If your script uses `{ggplot2}` version 2.2.1, then users should install this version as well, and by running the script above, the latest version of `{ggplot2}` (as of writing, version 3.4.0) will get installed. -Maybe that’s not a problem, but it can be if your script uses a function from +Maybe that's not a problem, but it can be if your script uses a function from version 2.2.1 that is not available anymore in the latest version (or maybe its -name got changed, or maybe it was modified somehow and doesn’t provide the exact +name got changed, or maybe it was modified somehow and doesn't provide the exact same result). The more packages the script uses (and the older it is), the higher the likelihood that some package version will not be compatible. There is also the issue of the R version itself. Generally speaking, recent versions of R seem to not be too bad when it comes to running older code written in R. I know -this because in 2022 I’ve run every example that comes bundled with R since +this because in 2022 I've run every example that comes bundled with R since version 0.6.0 on the then current version (as of writing) of R, version 4.2.2. ::: {.content-visible when-format="pdf"} @@ -336,15 +336,15 @@ installation of R run on the current version of R (version 4.2.2 as of writing). These are the examples from the default packages like `{base}`, `{stats}`, `{stats4}`, and so on. 
Turns out that more than 75% of the example code from version 0.6.0 still work on the current version of R. A small fraction output a -message (which doesn’t mean the code doesn’t work), some 5% raise a warning, -which again doesn’t necessarily mean that the code doesn’t work, and finally +message (which doesn't mean the code doesn't work), some 5% raise a warning, +which again doesn't necessarily mean that the code doesn't work, and finally around 20% or so produce errors. As you can see, the closer we get to the current release, the fewer errors get raised (if you want to run the code for yourself, check out this [Github repository](https://github.com/b-rodrigues/code_longevity)^[https://github.com/b-rodrigues/code_longevity]). (But something important should be noted: just because some old piece of code -runs without error, doesn’t mean that the result is exactly the same. There +runs without error, doesn't mean that the result is exactly the same. There might be cases where the same function returns different results on different versions of R.) @@ -363,7 +363,7 @@ up the code in a way that it actually is reproducible. So what does this all mean? This means that reproducibility is on a continuum, and depending on the constraints you face your project can be "not very -reproducible" to "totally reproducible". Let’s consider the following list of +reproducible" to "totally reproducible". Let's consider the following list of anything that can influence how reproducible your project truly is: - Version of the programming language used; @@ -383,14 +383,14 @@ reintroduce the idea and call it the "reproducibility iceberg". ::: {.content-hidden when-format="pdf"}
The reproducibility spectrum from Peng’s 2011 paper. -
The reproducibility spectrum from Peng’s 2011 paper.
+ alt="The reproducibility spectrum from Peng's 2011 paper."> +
The reproducibility spectrum from Peng's 2011 paper.
::: ::: {.content-visible when-format="pdf"} ```{r, echo = F} -#| fig-cap: "The reproducibility spectrum from Peng’s 2011 paper." +#| fig-cap: "The reproducibility spectrum from Peng's 2011 paper." knitr::include_graphics("images/repro_spectrum.png") ``` ::: @@ -409,14 +409,14 @@ first. You cannot even install older versions of R, unless you also compile those from source! Now I have read about a compatibility layer called Rosetta which enables to run binaries compiled for the Intel architecture on the ARM architecture, and maybe this works well with R and CRAN binaries compiled for -Intel architecture. Maybe, I don’t know. But my point is that you never know +Intel architecture. Maybe, I don't know. But my point is that you never know what might come in the future, and thus needing to be able to compile from source is important, because compiling from source is what requires the least amount of dependencies that are outside of your control. Relying on binaries is not future-proof (and which is again, another reason why open-source tools are a hard requirement for reproducibility). -And for you Windows users, don’t think that the preceding paragraph does not +And for you Windows users, don't think that the preceding paragraph does not concern you. I think that it is very likely that Microsoft will push in the future for OEM manufacturers to build more ARM based computers. There is already an ARM version of Windows after all, and it has been around for quite some time, @@ -434,7 +434,7 @@ more cloud based computing, but I think that this scenario is less likely than the one from before. But who knows. And in that case it is quite likely that the actual code will be running on Linux servers that will likely be ARM based because of energy and licensing costs. Here again, if you want to run your -historical code, you’ll have to compile old packages and R versions from source. +historical code, you'll have to compile old packages and R versions from source. 
Ok, so this might seem all incredibly complicated. How on earth are we supposed to manage all these risks and balance the immediate need for results with the @@ -448,4 +448,4 @@ way will also ensure that you avoid mistakes and producing results that are wrong. It will be easier and faster to iterate and improve your code, to collaborate, and ultimately to trust the results of your pipelines. So even if no one will rerun that code ever again, you will still benefit from the best -practices presented in this book. Let’s dive in! +practices presented in this book. Let's dive in! diff --git a/project_rewrite.qmd b/project_rewrite.qmd index aead2da..c0fa6d7 100644 --- a/project_rewrite.qmd +++ b/project_rewrite.qmd @@ -329,7 +329,7 @@ We’re now scraping data from Wikipedia of former Luxembourguish communes: ```{r} get_former_communes <- function( - url = "https://w.wiki/_wFe7", + url = "https://is.gd/lux_former_communes", min_year = 2009, table_position = 3 ){ @@ -351,8 +351,8 @@ We can scrape current communes: ```{r} get_current_communes <- function( - url = "https://w.wiki/6nPu", - table_position = 1 + url = "https://is.gd/lux_communes", + table_position = 2 ){ read_html(url) %>% diff --git a/project_start.qmd b/project_start.qmd index af99df5..b2d8f95 100644 --- a/project_start.qmd +++ b/project_start.qmd @@ -383,23 +383,34 @@ there. For this, we need a list of communes from Luxembourg. [Thankfully, Wikipedia has such a list](https://en.wikipedia.org/wiki/List_of_communes_of_Luxembourg)^[https://w.wiki/6nPu]. -Let's scrape and save this list: +An issue with scraping tables off the web is that they might change in the +future. It is therefore a good idea to save the page by right clicking on it and +then selecting save as, and then re-hosting it. I use Github pages to re-host +the Wikipedia page above [here](https://b-rodrigues.github.io/list_communes/). I +now have full control of this page, and won't get any bad surprises if someone +decides to eventually update it. 
Instead of re-hosting it, you could simply save +it as any other file of your project. + +So let's scrape and save this list: ```{r} -current_communes <- "https://w.wiki/6nPu" |> +current_communes <- "https://is.gd/lux_communes" |> rvest::read_html() |> rvest::html_table() |> - purrr::pluck(1) |> - janitor::clean_names() + purrr::pluck(2) |> + janitor::clean_names() |> + dplyr::filter(name_2 != "Name") |> + dplyr::rename(commune = name_2) ``` -We scrape the table from the Wikipedia page using `{rvest}`. +We scrape the table from the re-hosted Wikipedia page using `{rvest}`. `rvest::html_table()` returns a list of tables from the Wikipedia table, and -then we use `purrr::pluck()` to keep the first table from the website, which is +then we use `purrr::pluck()` to keep the second table from the website, which is what we need (I made the calls to the packages explicit, because you might not be familiar with these packages). `janitor::clean_names()` transforms column names written for human eyes into machine-friendly names (for example `Growth -rate in %` would be transformed to `growth_rate_in_percent`). +rate in %` would be transformed to `growth_rate_in_percent`) and then I use +the `{dplyr}` package for some further cleaning and renaming. Let’s see if we have all the communes in our data: @@ -415,10 +426,11 @@ there’s also a less obvious reason; since 2010, several communes have merged into new ones. So there are communes that are in our data in 2010 and 2011, but disappear from 2012 onwards. So we need to do several things: first, get a list of all existing communes from 2010 onwards, and then, harmonise -spelling. Here again, we can use a list from Wikipedia: +spelling. 
Here again, we can use a list from Wikipedia, and here again, I decide +to re-host it on Github pages to avoid problems in the future: ```{r} -former_communes <- "https://w.wiki/_wFe7" |> +former_communes <- "https://is.gd/lux_former_communes" |> rvest::read_html() |> rvest::html_table() |> purrr::pluck(3) |> From d3f8d8bde82a736678225b67eb9d288d38c5a7d4 Mon Sep 17 00:00:00 2001 From: Bruno Rodrigues Date: Mon, 2 Oct 2023 16:22:19 +0200 Subject: [PATCH 08/13] updated save_data.R --- scripts/analysis.R | 10 +++++----- scripts/save_data.R | 12 +++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/scripts/analysis.R b/scripts/analysis.R index 1023285..dee489f 100644 --- a/scripts/analysis.R +++ b/scripts/analysis.R @@ -3,12 +3,12 @@ library(ggplot2) library(purrr) library(tidyr) -#Let’s load the datasets: +#Let's load the datasets: commune_level_data <- read.csv("datasets/commune_level_data.csv") country_level_data <- read.csv("datasets/country_level_data.csv") -#Let’s compute the Laspeyeres index for each commune: +#Let's compute the Laspeyeres index for each commune: commune_level_data <- commune_level_data %>% group_by(locality) %>% @@ -21,7 +21,7 @@ commune_level_data <- commune_level_data %>% pl_m2 = average_price_m2_nominal_euros/p0_m2*100) -#Let’s also compute it for the whole country: +#Let's also compute it for the whole country: country_level_data <- country_level_data %>% mutate(p0 = ifelse(year == "2010", average_price_nominal_euros, NA)) %>% @@ -33,7 +33,7 @@ country_level_data <- country_level_data %>% #We are going to create a plot for 5 communes and compare the price evolution in the communes -#to the national price evolution. Let’s first list the communes: +#to the national price evolution. 
Let's first list the communes: communes <- c("Luxembourg", "Esch-sur-Alzette", @@ -122,7 +122,7 @@ wincrange_plot <- ggplot(data_to_plot) + group = locality, colour = locality)) -# Let’s save the plots +# Let's save the plots ggsave("plots/lux_plot.pdf", lux_plot) ggsave("plots/esch_plot.pdf", esch_plot) ggsave("plots/mamer_plot.pdf", mamer_plot) diff --git a/scripts/save_data.R b/scripts/save_data.R index cb0034e..478f073 100644 --- a/scripts/save_data.R +++ b/scripts/save_data.R @@ -39,7 +39,7 @@ raw_data <- raw_data |> str(raw_data) -# Let’s take a look at the spelling +# Let's take a look at the spelling raw_data |> dplyr::filter(grepl("Luxembourg", locality)) |> dplyr::count(locality) @@ -91,17 +91,19 @@ country_level_data <- full_join(country_level, offers_country) |> # We need to check if communes are all in our data -current_communes <- "https://en.wikipedia.org/wiki/List_of_communes_of_Luxembourg" |> +current_communes <- "https://is.gd/lux_communes" |> rvest::read_html() |> rvest::html_table() |> - purrr::pluck(1) |> - janitor::clean_names() + purrr::pluck(2) |> + janitor::clean_names() |> + dplyr::filter(name_2 != "Name") |> + dplyr::rename(commune = name_2) # Test if all communes are there setdiff(unique(commune_level_data$locality), current_communes$commune) # We need former communes -former_communes <- "https://en.wikipedia.org/wiki/Communes_of_Luxembourg#Former_communes" |> +former_communes <- "https://is.gd/lux_former_communes" |> rvest::read_html() |> rvest::html_table() |> purrr::pluck(3) |> From 355c733a74fe2181e1937e6c4ab22bf377adbaee Mon Sep 17 00:00:00 2001 From: Bruno Rodrigues Date: Mon, 2 Oct 2023 16:54:08 +0200 Subject: [PATCH 09/13] update to save_data.R and Rmd --- project_rewrite.qmd | 15 ++++++++++----- scripts/save_data.R | 7 ++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/project_rewrite.qmd b/project_rewrite.qmd index c0fa6d7..a8fda89 100644 --- a/project_rewrite.qmd +++ b/project_rewrite.qmd @@ -220,6 +220,7 
@@ read_clean <- function(..., sheet){ ) |> mutate(locality = str_trim(locality)) |> select(year, locality, n_offers, starts_with("average")) +} ``` @@ -355,10 +356,14 @@ get_current_communes <- function( table_position = 2 ){ - read_html(url) %>% - html_table() %>% - pluck(table_position) %>% - clean_names() + read_html(url) |> + html_table() |> + pluck(table_position) |> + clean_names() |> + filter(name_2 != "Name") |> + rename(commune = name_2) |> + mutate(commune = str_remove(commune, " .$")) + } ``` @@ -388,7 +393,7 @@ get_test_communes <- function(former_communes, current_communes){ communes[which(communes == "Clemency")] <- "Clémency" communes[which(communes == "Redange")] <- "Redange-sur-Attert" communes[which(communes == "Erpeldange-sur-Sûre")] <- "Erpeldange" - communes[which(communes == "Luxembourg-City")] <- "Luxembourg" + communes[which(communes == "Luxembourg City")] <- "Luxembourg" communes[which(communes == "Käerjeng")] <- "Kaerjeng" communes[which(communes == "Petange")] <- "Pétange" diff --git a/scripts/save_data.R b/scripts/save_data.R index 478f073..681e84c 100644 --- a/scripts/save_data.R +++ b/scripts/save_data.R @@ -97,7 +97,8 @@ current_communes <- "https://is.gd/lux_communes" |> purrr::pluck(2) |> janitor::clean_names() |> dplyr::filter(name_2 != "Name") |> - dplyr::rename(commune = name_2) + dplyr::rename(commune = name_2) |> + dplyr::mutate(commune = stringr::str_remove(commune, " .$")) # Test if all communes are there setdiff(unique(commune_level_data$locality), current_communes$commune) @@ -121,12 +122,12 @@ communes <- unique(c(former_communes$name, current_communes$commune)) communes[which(communes == "Clemency")] <- "Clémency" communes[which(communes == "Redange")] <- "Redange-sur-Attert" communes[which(communes == "Erpeldange-sur-Sûre")] <- "Erpeldange" -communes[which(communes == "Luxembourg-City")] <- "Luxembourg" +communes[which(communes == "Luxembourg City")] <- "Luxembourg" communes[which(communes == "Käerjeng")] <- "Kaerjeng" 
communes[which(communes == "Petange")] <- "Pétange" -# Test if this set is empty, if yes, we’re good +# Test if this set is empty, if yes, we're good setdiff(unique(commune_level_data$locality), communes) # save the data (uncomment if you need to save) From 13f97f89d1790f856a72073fdbf3ca28a3542d25 Mon Sep 17 00:00:00 2001 From: Bruno Rodrigues Date: Mon, 2 Oct 2023 17:04:28 +0200 Subject: [PATCH 10/13] updated save_data.Rmd --- rmds/save_data.Rmd | 30 ++- rmds/save_data.html | 399 ++++++++++++++++++++++++++++++--------- rmds/save_data_fusen.Rmd | 276 +++++++++++++++++++++++++++ 3 files changed, 601 insertions(+), 104 deletions(-) create mode 100644 rmds/save_data_fusen.Rmd diff --git a/rmds/save_data.Rmd b/rmds/save_data.Rmd index e1668e9..132606a 100644 --- a/rmds/save_data.Rmd +++ b/rmds/save_data.Rmd @@ -95,9 +95,12 @@ were mergers in 2011, 2015 and 2018. So we need to account for these localities. We’re now scraping data from wikipedia of former Luxembourguish communes: ```{r} -get_former_communes <- function(url = "https://en.wikipedia.org/wiki/Communes_of_Luxembourg#Former_communes", - min_year = 2009, - table_position = 3){ +get_former_communes <- function( + url = "https://is.gd/lux_former_communes", + min_year = 2009, + table_position = 3 + ){ + read_html(url) %>% html_table() %>% pluck(table_position) %>% @@ -114,12 +117,19 @@ former_communes <- get_former_communes() We can scrape current communes: ```{r} -get_current_communes <- function(url = "https://en.wikipedia.org/wiki/List_of_communes_of_Luxembourg", - table_position = 1){ - read_html(url) %>% - html_table() %>% - pluck(table_position) %>% - clean_names() +get_current_communes <- function( + url = "https://is.gd/lux_communes", + table_position = 2 + ){ + + read_html(url) |> + html_table() |> + pluck(table_position) |> + clean_names() |> + filter(name_2 != "Name") |> + rename(commune = name_2) |> + mutate(commune = str_remove(commune, " .$")) + } ``` @@ -141,7 +151,7 @@ get_test_communes <- 
function(former_communes, current_communes){ communes[which(communes == "Clemency")] <- "Clémency" communes[which(communes == "Redange")] <- "Redange-sur-Attert" communes[which(communes == "Erpeldange-sur-Sûre")] <- "Erpeldange" - communes[which(communes == "Luxembourg-City")] <- "Luxembourg" + communes[which(communes == "Luxembourg City")] <- "Luxembourg" communes[which(communes == "Käerjeng")] <- "Kaerjeng" communes[which(communes == "Petange")] <- "Pétange" diff --git a/rmds/save_data.html b/rmds/save_data.html index c63c934..18e6ed7 100644 --- a/rmds/save_data.html +++ b/rmds/save_data.html @@ -11,38 +11,232 @@ - + Nominal house prices data in Luxembourg - Data cleaning - - + + - - - - + + + + - - +h1.title {font-size: 38px;} +h2 {font-size: 30px;} +h3 {font-size: 24px;} +h4 {font-size: 18px;} +h5 {font-size: 16px;} +h6 {font-size: 12px;} +code {color: inherit; background-color: rgba(0, 0, 0, 0.04);} +pre:not([class]) { background-color: white } + + +code{white-space: pre-wrap;} +span.smallcaps{font-variant: small-caps;} +span.underline{text-decoration: underline;} +div.column{display: inline-block; vertical-align: top; width: 50%;} +div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} +ul.task-list{list-style: none;} +