updated save_data.Rmd

b-rodrigues · Oct 2, 2023 · 13f97f8 · 13f97f8
1 parent 355c733
commit 13f97f8
Show file tree

Hide file tree

Showing 3 changed files with 601 additions and 104 deletions.
diff --git a/rmds/save_data.Rmd b/rmds/save_data.Rmd
@@ -95,9 +95,12 @@ were mergers in 2011, 2015 and 2018. So we need to account for these localities.
 We’re now scraping data from wikipedia of former Luxembourguish communes:
 
 ```{r}
-get_former_communes <- function(url = "https://en.wikipedia.org/wiki/Communes_of_Luxembourg#Former_communes",
-                                min_year = 2009,
-                                table_position = 3){
+get_former_communes <- function(
+            url = "https://is.gd/lux_former_communes",
+            min_year = 2009,
+            table_position = 3
+            ){
+
   read_html(url) %>%
     html_table() %>%
     pluck(table_position) %>%
@@ -114,12 +117,19 @@ former_communes <- get_former_communes()
 We can scrape current communes:
 
 ```{r}
-get_current_communes <- function(url = "https://en.wikipedia.org/wiki/List_of_communes_of_Luxembourg",
-                                 table_position = 1){
-  read_html(url) %>%
-    html_table() %>%
-    pluck(table_position) %>%
-    clean_names()
+get_current_communes <- function(
+                 url = "https://is.gd/lux_communes",
+                 table_position = 2
+                 ){
+
+  read_html(url) |>
+    html_table() |>
+    pluck(table_position) |>
+    clean_names() |>
+    filter(name_2 != "Name") |>
+    rename(commune = name_2) |>
+    mutate(commune = str_remove(commune, " .$"))
+
 }
 
 ```
@@ -141,7 +151,7 @@ get_test_communes <- function(former_communes, current_communes){
   communes[which(communes == "Clemency")] <- "Clémency"
   communes[which(communes == "Redange")] <- "Redange-sur-Attert"
   communes[which(communes == "Erpeldange-sur-Sûre")] <- "Erpeldange"
-  communes[which(communes == "Luxembourg-City")] <- "Luxembourg"
+  communes[which(communes == "Luxembourg City")] <- "Luxembourg"
   communes[which(communes == "Käerjeng")] <- "Kaerjeng"
   communes[which(communes == "Petange")] <- "Pétange"
 

diff --git a/rmds/save_data.html b/rmds/save_data.html
diff --git a/rmds/save_data_fusen.Rmd b/rmds/save_data_fusen.Rmd
@@ -0,0 +1,276 @@
+---
+title: "Nominal house prices data in Luxembourg - Data cleaning"
+author: "Bruno Rodrigues"
+date: "`r Sys.Date()`"
+---
+
+```{r, warning=FALSE, message=FALSE}
+library(dplyr)
+library(ggplot2)
+library(janitor)
+library(purrr)
+library(readxl)
+library(rvest)
+library(stringr)
+```
+
+## Downloading the data
+
+This data is downloaded from the luxembourguish [Open Data
+Portal](https://data.public.lu/fr/datasets/prix-annonces-des-logements-par-commune/)
+(the data set called *Série rétrospective des prix annoncés des maisons par commune, de 2010 à 2021*), and the original data is from the "Observatoire de l'habitat". This data
+contains prices for houses sold since 2010 for each luxembourguish commune.
+
+The function below uses the permanent URL from the Open Data Portal to access the data,
+but I have also rehosted the data, and use my link to download the data (for archival
+purposes):
+
+```{r}
+get_raw_data <- function(url = "https://data.public.lu/fr/datasets/r/14b0156e-ff87-4a36-a867-933fc9a6f903"){
+
+  raw_data <- tempfile(fileext = ".xlsx")
+
+  download.file(url,
+                raw_data,
+                mode = "wb") # for compatibility with Windows
+
+  sheets <- excel_sheets(raw_data)
+
+  read_clean <- function(..., sheet){
+    read_excel(..., sheet = sheet) %>%
+      mutate(year = sheet)
+  }
+
+  raw_data <- map_dfr(sheets,
+                      ~read_clean(raw_data,
+                                  skip = 10,
+                                  sheet = .)) %>%
+    clean_names()
+
+  raw_data %>%
+    rename(locality = commune,
+           n_offers = nombre_doffres,
+           average_price_nominal_euros = prix_moyen_annonce_en_courant,
+           average_price_m2_nominal_euros = prix_moyen_annonce_au_m2_en_courant,
+           average_price_m2_nominal_euros = prix_moyen_annonce_au_m2_en_courant
+           ) %>%
+    mutate(locality = str_trim(locality)) %>%
+    select(year, locality, n_offers, starts_with("average"))
+
+}
+
+```
+
+```{r}
+raw_data <- get_raw_data(url = "https://github.com/b-rodrigues/rap4all/raw/master/datasets/vente-maison-2010-2021.xlsx")
+```
+
+We need clean the data: "Luxembourg" is "Luxembourg-ville" in 2010 and 2011,
+then "Luxembourg". "Pétange" is also spelled non-consistently, and we also need
+to convert columns to right type. We also directly remove rows where the
+locality contains information on the "Source":
+
+```{r}
+clean_raw_data <- function(raw_data){
+  raw_data %>%
+    mutate(locality = ifelse(grepl("Luxembourg-Ville", locality),
+                             "Luxembourg",
+                             locality),
+           locality = ifelse(grepl("P.tange", locality),
+                             "Pétange",
+                             locality)
+           ) %>%
+    filter(!grepl("Source", locality)) %>%
+    mutate(across(starts_with("average"), as.numeric))
+}
+```
+
+```{r}
+flat_data <- clean_raw_data(raw_data)
+```
+
+We now need to make sure that we got all the communes/localities in there. There
+were mergers in 2011, 2015 and 2018. So we need to account for these localities.
+
+We’re now scraping data from wikipedia of former Luxembourguish communes:
+
+```{r}
+get_former_communes <- function(
+            url = "https://is.gd/lux_former_communes",
+            min_year = 2009,
+            table_position = 3
+            ){
+
+  read_html(url) %>%
+    html_table() %>%
+    pluck(table_position) %>%
+    clean_names() %>%
+    filter(year_dissolved > min_year)
+}
+
+```
+
+```{r}
+former_communes <- get_former_communes()
+```
+
+We can scrape current communes:
+
+```{r}
+get_current_communes <- function(
+                 url = "https://is.gd/lux_communes",
+                 table_position = 2
+                 ){
+
+  read_html(url) |>
+    html_table() |>
+    pluck(table_position) |>
+    clean_names() |>
+    filter(name_2 != "Name") |>
+    rename(commune = name_2) |>
+    mutate(commune = str_remove(commune, " .$"))
+
+}
+
+```
+
+```{r}
+current_communes <- get_current_communes()
+```
+
+Let’s now create a list of all communes:
+
+```{r}
+get_test_communes <- function(former_communes, current_communes){
+
+  communes <- unique(c(former_communes$name, current_communes$commune))
+  # we need to rename some communes
+
+  # Different spelling of these communes between wikipedia and the data
+
+  communes[which(communes == "Clemency")] <- "Clémency"
+  communes[which(communes == "Redange")] <- "Redange-sur-Attert"
+  communes[which(communes == "Erpeldange-sur-Sûre")] <- "Erpeldange"
+  communes[which(communes == "Luxembourg City")] <- "Luxembourg"
+  communes[which(communes == "Käerjeng")] <- "Kaerjeng"
+  communes[which(communes == "Petange")] <- "Pétange"
+
+  communes
+}
+
+```
+
+```{r}
+former_communes <- get_former_communes()
+current_communes <- get_current_communes()
+
+communes <- get_test_communes(former_communes, current_communes)
+```
+
+Let’s test to see if all the communes from our dataset are represented.
+
+```{r}
+setdiff(flat_data$locality, communes)
+```
+
+If the above code doesn’t show any communes, then this means that we are
+accounting for every commune.
+
+Let’s keep the national average in another dataset:
+
+```{r}
+make_country_level_data <- function(flat_data){
+  country_level <- flat_data %>%
+    filter(grepl("nationale", locality)) %>%
+    select(-n_offers)
+
+  offers_country <- flat_data %>%
+    filter(grepl("Total d.offres", locality)) %>%
+    select(year, n_offers)
+
+  full_join(country_level, offers_country) %>%
+    select(year, locality, n_offers, everything()) %>%
+    mutate(locality = "Grand-Duchy of Luxembourg")
+
+}
+
+```
+
+```{r}
+country_level_data <- make_country_level_data(flat_data)
+```
+
+We can finish cleaning the commune data:
+
+```{r}
+make_commune_level_data <- function(flat_data){
+  flat_data %>%
+    filter(!grepl("nationale|offres", locality),
+           !is.na(locality))
+}
+
+```
+
+```{r}
+commune_level_data <- make_commune_level_data(flat_data)
+```
+
+We now save the dataset in a folder for further analysis (keep chunk option to
+`eval = FALSE` to avoid running it when knitting):
+
+```{r, eval = FALSE}
+write.csv(commune_level_data,
+          "datasets/house_prices_commune_level_data.csv",
+          row.names = FALSE)
+write.csv(country_level_data,
+          "datasets/house_prices_country_level_data.csv",
+          row.names = FALSE)
+```
+
+## Functions used for analysis
+
+The following function compute the Laspeyeres prices index:
+
+```{r}
+get_laspeyeres <- function(dataset){
+
+  which_dataset <- deparse(substitute(dataset))
+
+  group_var <- if(grepl("commune", which_dataset)){
+                 quo(locality)
+               } else {
+                 NULL
+               }
+  dataset %>%
+    group_by(!!group_var) %>%
+    mutate(p0 = ifelse(year == "2010", average_price_nominal_euros, NA)) %>%
+    fill(p0, .direction = "down") %>%
+    mutate(p0_m2 = ifelse(year == "2010", average_price_m2_nominal_euros, NA)) %>%
+    fill(p0_m2, .direction = "down") %>%
+    ungroup() %>%
+    mutate(pl = average_price_nominal_euros/p0*100,
+           pl_m2 = average_price_m2_nominal_euros/p0_m2*100)
+
+}
+```
+
+and this function plots the data:
+
+```{r}
+make_plot <- function(commune){
+
+  commune_data <- commune_level_data %>%
+    filter(locality == commune)
+
+  data_to_plot <- bind_rows(
+    country_level_data,
+    commune_data
+  )
+
+  ggplot(data_to_plot) +
+    geom_line(aes(y = pl_m2,
+                  x = year,
+                  group = locality,
+                  colour = locality))
+}
+```