TIP Comparison Text Analysis.Rmd

---
title: "TIP Comparison Text Analysis"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

library(RODBC)
library(openxlsx)
library(tidyverse)
library(compareDF) # for comparing 2 df's with same columns
library(htmltools)
library(formattable) # highlighting table
library(htmlwidgets) # saving formatted table
library(tm) # for stop words
```

## Read in 2019 draft and 2019 final data.

**Note that in order to use the RODBC package you must have a 32-bit driver installed, and you must be using the 32-bit version of R/RStudio.**

* To check what version of R you're using: Tools > Global Options.

The R version should read [Default][32-bit] (unless you're using a non-default 32-bit version).  All packages will need to be re-installed under the 32-bit R.  Once the packages are installed, you may switch back and forth between versions as preferred.

```{r}
#attempt <- RODBC::odbcConnect("TIP_2019_Final")

# Read the AllData table from the 2019 draft and 2019 final Access databases.
# RODBC reports failure through return values rather than errors: a failed
# connection is not an object of class "RODBC", and a failed sqlQuery() returns
# a character vector of messages instead of a data frame.  Both are checked
# explicitly so that a bad path or missing table stops here, not in a later join.

# Draft database data
draft_19_path <- "N:\\MTS\\Working\\TABTAC\\TIP\\DataComparisonProject\\2019Draft.accdb"
draft_19_conn <- odbcConnectAccess2007(draft_19_path)
if (!inherits(draft_19_conn, "RODBC")) {
  stop("Could not connect to draft database: ", draft_19_path, call. = FALSE)
}
sqlTables(draft_19_conn, tableType = "TABLE")$TABLE_NAME # Check available tables
draft_19 <- RODBC::sqlQuery(draft_19_conn, 'select * from AllData')

# Final database data
final_19_path <- "N:\\MTS\\Working\\TABTAC\\TIP\\DataComparisonProject\\2019_Final.accdb"
final_19_conn <- odbcConnectAccess2007(final_19_path)
if (!inherits(final_19_conn, "RODBC")) {
  odbcCloseAll() # do not leave the draft connection open when aborting
  stop("Could not connect to final database: ", final_19_path, call. = FALSE)
}
sqlTables(final_19_conn, tableType = "TABLE")$TABLE_NAME # Check available tables
final_19 <- RODBC::sqlQuery(final_19_conn, 'select * from AllData')

odbcCloseAll()

# Validate the query results only after the connections have been released
if (!is.data.frame(draft_19)) {
  stop("Query of AllData failed for draft database: ",
       paste(draft_19, collapse = "; "), call. = FALSE)
}
if (!is.data.frame(final_19)) {
  stop("Query of AllData failed for final database: ",
       paste(final_19, collapse = "; "), call. = FALSE)
}
```

## To start, take a look at only projects that show up in **both** the draft TIP and the final TIP in 2019 (i.e., common to both datasets).

```{r}
# Columns carried into the pared-down draft/final comparison
vars <- c("Projnum", "Description", "Project Total")

# Every column of both sources, for projects present in both
draft_final_19_full <- inner_join(draft_19, final_19, by = "Projnum")

# Alphabetize dataframe for look-through
comparison_full_19 <- draft_final_19_full %>%
  select(all_of(sort(colnames(draft_final_19_full))))

# Select variables needed for now
draft_19_pared <- draft_19 %>%
  dplyr::select(all_of(vars)) %>%
  rename(Draft_desc = Description,
         Draft_total = `Project Total`)

final_19_pared <- final_19 %>%
  dplyr::select(all_of(vars)) %>%
  rename(Final_desc = Description,
         Final_total = `Project Total`)

draft_final_19 <- inner_join(draft_19_pared, final_19_pared, by = "Projnum")

# Remove perfect matches.
# `==` returns NA when either side is missing, which would make the match
# flags NA and let filter() silently discard those projects.  coalesce() falls
# back to "both missing" so that NA vs NA counts as a match (1) and NA vs a
# value counts as a difference (0).
differing <- draft_final_19 %>%
  mutate(
    Desc_matches_exactly = as.numeric(coalesce(
      as.character(Draft_desc) == as.character(Final_desc),
      is.na(Draft_desc) & is.na(Final_desc)
    )),
    Total_matches_exactly = as.numeric(coalesce(
      Draft_total == Final_total,
      is.na(Draft_total) & is.na(Final_total)
    )),
    Perfect_match = as.numeric(Desc_matches_exactly == 1 &
                                 Total_matches_exactly == 1)
  ) %>%
  filter(Perfect_match == 0)

# Check how many differences (one row per combination of match flags)
differing %>%
  count(Desc_matches_exactly, Total_matches_exactly)

# There are more differences with totals than with descriptions; check out differences in totals first

```

## Look at projects where there's no difference in text description from draft to final, but there *is* a difference in project totals.  What other columns differ between the draft and final (i.e., why did the total change)?

```{r}
# Projects whose description is unchanged but whose total moved.  The gross and
# percent change in the total show whether a difference is large enough to be
# worth pursuing.
totdiff_descmatch <- differing %>%
  filter(Desc_matches_exactly == 1, Total_matches_exactly == 0) %>%
  select(-c(Desc_matches_exactly, Total_matches_exactly, Perfect_match,
            Final_desc, Draft_desc)) %>%
  mutate(
    Difference_in_totals_gross = Final_total - Draft_total,
    Difference_in_total_percent = Difference_in_totals_gross / Draft_total * 100
  )

# Save the list of affected projects for review outside R
write_csv(totdiff_descmatch, "Differing Proj Totals, Same Descs.csv")

# Some project totals differ by up to +300%.  Pursue further differences

# Project numbers whose totals changed while the description stayed the same
tot_diff_ids <- totdiff_descmatch %>% select(Projnum)

# Full draft and final records for just those projects
tot_diff_draft19 <- inner_join(tot_diff_ids, draft_19, by = "Projnum")
tot_diff_final19 <- inner_join(tot_diff_ids, final_19, by = "Projnum")

# Find out which columns are different from draft to final; these will need to be dropped to use compare_df
draft_19_names <- names(draft_19)
final_19_names <- names(final_19)

names_19 <- list(draft = draft_19_names, final = final_19_names) # nested list of 2
unique_names <- unique(unlist(names_19)) # unique names across both draft and final
# Printed for inspection: for each source, the columns of the combined set that it lacks
purrr::map(names_19, setdiff, x = unique_names)

# Set discrepant columns (present in one source only)
draft_remove <- setdiff(x = draft_19_names, y = final_19_names)
final_remove <- setdiff(x = final_19_names, y = draft_19_names)

# Remove discrepant columns.  all_of() marks the character vectors as column
# names held in external variables and also copes with an empty vector.
tot_draft19 <- tot_diff_draft19 %>%
  dplyr::select(-all_of(draft_remove))

tot_final19 <- tot_diff_final19 %>%
  dplyr::select(-all_of(final_remove))

# Compare differences between all columns for rows that have differing totals, but not differing text descriptions - note that both dataframes are included in html output as ordinal lines
ctable_tot_diff <- compare_df(tot_draft19, tot_final19, c("Projnum"), keep_unchanged_cols = FALSE)

# Input order of tables here (used in tidying up the table)
first_table <- "Draft 2019"
second_table <- "Final 2019"

save_html(ctable_tot_diff$html_output, "Draft to Final 2019 Projects with Differing Totals.html")

# Reformat table so comparisons are side by side: one row per project, and for
# every compared variable a "<variable>_<first_table>" and a
# "<variable>_<second_table>" column.
# The source label is derived from compare_df's change marker rather than from
# row position: "+" rows come from the first data frame passed to compare_df()
# (the draft), every other row from the second (the final).
diff_format <- ctable_tot_diff$comparison_df %>%
  mutate(Source = if_else(chng_type == "+", first_table, second_table)) %>%
  dplyr::select(Source, Projnum, everything(), -chng_type) %>%
  # stack every compared variable; only the two identifier columns stay wide
  gather(-Source, -Projnum, key = "Column", value = "Value") %>%
  unite(Variable, Column, Source) %>%
  spread(Variable, value = Value)

# Create matrix (called 'col_comparisons') of columns to be compared.
# Each column is numbered and pointed at its neighbour (odd -> next, even ->
# previous).  NOTE(review): this assumes spread() leaves each variable's two
# source columns adjacent, first_table before second_table - confirm.
compare <- tibble(Col = setdiff(colnames(diff_format), "Projnum")) %>%
  mutate(ID = row_number(),
         compare_row = ifelse(ID %% 2 == 0, ID - 1, ID + 1))

# Self-join on the neighbour pointer and keep the odd-numbered rows, giving one
# row per pair: Col.x (odd-numbered column) and Col.y (the column after it)
col_comparisons <- full_join(compare, compare, by = c("ID" = "compare_row")) %>%
  filter(ID %% 2 != 0) %>%
  dplyr::select(Col.x, Col.y)

# Create function to find differences between pairs of columns
#
# Arguments:
#   comparison_row  row index into `pairs` selecting the pair to compare
#   pairs           data frame with character columns Col.x and Col.y naming
#                   the two columns to compare; defaults to the global
#                   `col_comparisons`
#   data            data frame holding those columns; defaults to the global
#                   `diff_format`
# Returns:
#   `data` reduced to the two compared columns plus a third column holding
#   "Same" or "Differs" for each row.  The third column is named after the
#   text before the first underscore in the Col.x name.
# Missing values: two missing values count as "Same"; a missing value against
# a present one counts as "Differs" (a plain `==` would give NA for both).
compare_cols <- function(comparison_row,
                         pairs = col_comparisons,
                         data = diff_format) {
  var1 <- pairs$Col.x[[comparison_row]]
  var2 <- pairs$Col.y[[comparison_row]]
  prefix <- gsub("_.*", "", x = var1)

  left <- data[[var1]]
  right <- data[[var2]]

  # Element-wise equality, with the NA results resolved by "both missing"
  same <- left == right
  unresolved <- is.na(same)
  same[unresolved] <- (is.na(left) & is.na(right))[unresolved]

  compare_vars <- data[c(var1, var2)]
  compare_vars[["Matches"]] <- ifelse(same, "Same", "Differs")
  colnames(compare_vars) <- c(var1, var2, prefix)

  compare_vars
}

#compare_cols(7)

# Run compare_cols() for every column pair to find differences between all
# columns.  seq_len() yields an empty sequence when there are no pairs, where
# 1:nrow() would count down from 1 to 0.
comparisons_num <- seq_len(nrow(col_comparisons))
comparisons <- lapply(comparisons_num, compare_cols)

# Convert output to dataframe, add project number and rearrange.
# Each compare_cols() result keeps the row order of diff_format, so the
# project numbers can be bound on by position.
comparisons_df <- do.call(cbind, comparisons)
comparisons_full <- cbind(comparisons_df, tibble(Projnum = diff_format$Projnum))
comparisons_tidy <- comparisons_full %>%
  dplyr::select(Projnum, everything())

```

## Format the comparison table with highlights.

```{r}
# Text color: orange for a cell labelled "Differs", grey for everything else
# (including missing values)
color.picker <- function(z) {
  if (is.na(z) || z != "Differs") {
    return("grey")
  }
  "orange"
}

# Background color: yellow for a cell labelled "Differs", white for everything
# else (including missing values)
bg.picker <- function(z) {
  if (is.na(z) || z != "Differs") {
    return("white")
  }
  "yellow"
}

# Assign conditional formatting: a block-level span whose text and background
# colors are chosen cell by cell with color.picker() and bg.picker().
# vapply() always returns a character vector, including for an empty column,
# where sapply() would return an empty list.
cond_color <- formatter("span",
                        style = x ~ style(display = "block",
                                          "border-radius" = "4px",
                                          "padding-right" = "4px",
                                          color = vapply(x, color.picker, character(1)),
                                          "background-color" = vapply(x, bg.picker, character(1))))

# Format table with conditional formatting (single-column trial): yellow
# background where the value equals "Differs", red otherwise.
# NOTE(review): `AC_Final 2019` is one of the raw-value columns rather than a
# Same/Differs column, so this looks like a leftover experiment - confirm
# whether it is still wanted.
formattable(comparisons_tidy, list(
            `AC_Final 2019` = formatter("span",
                                        style = AC ~ ifelse(AC == "Differs", "background-color: yellow", "background-color: red"))))

# Create the list of formatters: one cond_color entry per compared variable
comparison_cols <- enframe(colnames(comparisons_tidy))

# Distinct variable names: the text before the first underscore of every
# column except Projnum.  These are the names compare_cols() gives its
# Same/Differs columns.
cols_unique <- tibble(
  Variable = unique(sub("_.*", "", setdiff(comparison_cols$value, "Projnum")))
)

# One formatter per variable, named after the column it applies to.  Sized by
# the number of variables (rows), not by the tibble's column count.
table_cols <- rep(list(cond_color), nrow(cols_unique))
names(table_cols) <- cols_unique$Variable

FT <- formattable(comparisons_tidy, table_cols)

saveWidget(as.htmlwidget(FT), "Project_totals_differ.html", title = "Projects with differing totals, but no difference in descriptions")
```

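## Look at projects where there *is* a difference in text description from draft to final.  Normalize case, asterisks, hyphens, and whitespace to eliminate insignificant changes (an English stop-word list is loaded, but stop words are not yet removed).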

```{r}

# Projects whose raw descriptions are not identical
desc_differs <- differing %>%
  filter(Desc_matches_exactly == 0)

# English stop-word list from tm.
# NOTE(review): stop_words is built here but not applied to the descriptions
# anywhere in this chunk - confirm whether stop-word removal is still intended.
stop_words <- stopwords(kind = "en")

# Normalise a vector of descriptions so that cosmetic edits do not register as
# changes: lower-case, runs of asterisks and of hyphens become a space, runs
# of spaces collapse to one.  Trimming happens last because replacing a
# leading or trailing "*" or "-" introduces a space at the edge of the string.
tidy_desc <- function(x) {
  x %>%
    tolower() %>%
    str_replace_all("\\*+", " ") %>%
    str_replace_all("\\-+", " ") %>%
    str_replace_all("[ ]+", " ") %>% # get rid of runs of WS
    trimws()
}

# Keep only the projects whose descriptions still differ after normalisation.
# coalesce() resolves the NA that `==` gives for missing descriptions: two
# missing descriptions are the same, a missing one against a present one differs.
descriptions_tidy <- desc_differs %>%
  mutate(Draft_desc = tidy_desc(Draft_desc),
         Final_desc = tidy_desc(Final_desc),
         Same_after_tidy = coalesce(Draft_desc == Final_desc,
                                    is.na(Draft_desc) & is.na(Final_desc))) %>%
  filter(!Same_after_tidy) %>%
  dplyr::select(Projnum, Draft_desc, Final_desc, Draft_total, Final_total)

write_csv(descriptions_tidy, "Text descriptions differ.csv")

```