v0.3.0

Refer to NEWS.md for details of this update.
sherrisherry · Dec 2, 2018 · 191ecc8 · 191ecc8
1 parent e0ef585
commit 191ecc8
Show file tree

Hide file tree

Showing 18 changed files with 181 additions and 68 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,17 +1,17 @@
 Package: cleandata
 Type: Package
-Title: To Inspect, Impute, Encode, and Partition Data; and to Keep Track of This Process
-Version: 0.2.0
+Title: To Inspect and Manipulate Data; and to Keep Track of This Process
+Version: 0.3.0
 Author: Sherry Zhao
 Maintainer: Sherry Zhao <[email protected]>
 Description: Functions to work with data frames to prepare data for further analysis.
-    The functions for imputation, encoding, and Partitioning can produce log files to keep track of data manipulation process.
+    The functions for imputation, encoding, partitioning, and other manipulation can produce log files to keep track of the process.
 BugReports: https://github.com/sherrisherry/cleandata/issues
 URL: https://github.com/sherrisherry/cleandata
 Depends: R (>= 3.0.0)
 Imports: stats
-Suggests: rmarkdown, knitr
+Suggests: R.rsp
 License: MIT + file LICENSE
 Encoding: UTF-8
-VignetteBuilder: knitr
+VignetteBuilder: R.rsp
 LazyData: true
diff --git a/LICENSE b/LICENSE
@@ -1,2 +1,2 @@
-YEAR: 2018
+YEAR: 2018 - 2019
 COPYRIGHT HOLDER: Xiaoli (Sherry) Zhao
diff --git a/NAMESPACE b/NAMESPACE
@@ -1 +1 @@
-exportPattern("^inspect_.+|^encode_.+|^impute_.+|^partition_.+")
+exportPattern("^(.$|[^i].+|i[^n].*|in[^_].+)")
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,12 @@
+### v0.3.0
+* made parameter 'log' able to take value from a 'log_arg' variable in the parent environment (dynamic scoping) of a function
+  * the old way of assigning value to 'log' is still supported
+* new function:
+  * wh_dict:	Create Data Dictionary from Data Warehouse
+* moved some code to internal functions and internal variables
+* revised documentation
+* bug fixes
+
 ### v0.2.0
 * new functions:
   1. partition_random: partitioning a dataset randomly

diff --git a/R/encoders.R b/R/encoders.R
@@ -1,5 +1,5 @@
-encode_ordinal<-function(x,order,none='',out.int=FALSE,full_print=TRUE,log=FALSE){
-  if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
+encode_ordinal<-function(x,order,none='',out.int=FALSE,full_print=TRUE, log = eval.parent(in_log_default)){
+  if(is.null(dim(x)))stop(in_msg1)
   if(full_print)print(summary(x))
   for(i in 1:ncol(x)){
     for(j in 1:length(order))levels(x[,i])[levels(x[,i])==order[j]] <- j
@@ -13,12 +13,12 @@ encode_ordinal<-function(x,order,none='',out.int=FALSE,full_print=TRUE,log=FALSE
       x[,i]<-as.integer(x[,i])}
     if(full_print)print(summary(x))
   }
-  if(is.list(log))log_plan1(x = x, log = log, sche.names = c(none,order), sche.codes = 0:length(order))
+  if(is.list(log))in_log1(x = x, log = log, sche.names = c(none,order), sche.codes = 0:length(order))
   return(x)
 }
 
-encode_binary<-function(x,out.int=FALSE,full_print=TRUE,log=FALSE){
-  if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
+encode_binary<-function(x,out.int=FALSE,full_print=TRUE, log = eval.parent(in_log_default)){
+  if(is.null(dim(x)))stop(in_msg1)
   if(full_print)print(summary(x))
   if(is.list(log)){
     map<-inspect_map(x,message=FALSE)
@@ -30,7 +30,7 @@ encode_binary<-function(x,out.int=FALSE,full_print=TRUE,log=FALSE){
       for(j in 1:ncol(y))levels(y[,j])<-c(0,1)
       x[,cols[[i]]]<-y
       cat(paste('coded',j,'cols','\n'))
-	  log_plan1(x = x, log = log, sche.names = lvs[[i]], sche.codes = c(0,1))
+	  in_log1(x = x, log = log, sche.names = lvs[[i]], sche.codes = c(0,1))
     }
     rm(y)
   }
@@ -48,8 +48,8 @@ encode_binary<-function(x,out.int=FALSE,full_print=TRUE,log=FALSE){
   return(x)
 }
 
-encode_onehot<-function(x, colname.sep = '_', drop1st=FALSE, full_print=TRUE, log=FALSE){
-  if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
+encode_onehot<-function(x, colname.sep = '_', drop1st=FALSE, full_print=TRUE, log = eval.parent(in_log_default)){
+  if(is.null(dim(x)))stop(in_msg1)
   if(sum(is.na(x)))warning('NAs are ignored in encoding')
   if(full_print)print(summary(x))
   cols<-colnames(x)
@@ -62,6 +62,6 @@ encode_onehot<-function(x, colname.sep = '_', drop1st=FALSE, full_print=TRUE, lo
   }
   if(full_print)print(apply(encoded,2,sum))
   if(is.list(log))
-	log_plan2(x = x, log = log, proc = 'e', method = 'Onehot', details = paste('Template of New Column Names: oldname', colname.sep, 'level; Dropping 1st Level: ', drop1st, sep=''))
+	in_log2(x = x, log = log, proc = 'e', method = 'Onehot', details = paste('Template of New Column Names: oldname', colname.sep, 'level; Dropping 1st Level: ', drop1st, sep=''))
   return(encoded)
 }
diff --git a/R/imputers.R b/R/imputers.R
@@ -1,23 +1,23 @@
 # impute NAs in factorial columns by the mode of corresponding columns
-impute_mode<-function(x,cols=colnames(x),idx=row.names(x),log=FALSE){
-  if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
-  for(i in 1:length(cols))x[is.na(x[,cols[i]]),cols[i]]<-names(which.max(table(x[idx,cols[i]])))
-  if(is.list(log))log_plan2(x = x, cols = cols, log = log, method = 'Mode')
+impute_mode<-function(x,cols=colnames(x),idx=row.names(x),log = eval.parent(in_log_default)){
+  if(is.null(dim(x)))stop(in_msg1)
+  for(i in 1:length(cols))x[is.na(x[,cols[i]]),cols[i]]<-in_mode(x[idx,cols[i]])
+  if(is.list(log))in_log2(x = x, cols = cols, log = log, method = 'Mode')
   return(x)
 }
 
 # impute NAs in numerical columns by the median of corresponding columns
-impute_median<-function(x,cols=colnames(x),idx=row.names(x),log=FALSE){
-  if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
+impute_median<-function(x,cols=colnames(x),idx=row.names(x), log = eval.parent(in_log_default)){
+  if(is.null(dim(x)))stop(in_msg1)
   for(i in 1:length(cols))x[is.na(x[,cols[i]]),cols[i]]<-stats::median(x[idx,cols[i]],na.rm = TRUE)
-  if(is.list(log))log_plan2(x = x, cols = cols, log = log, method = 'Median')
+  if(is.list(log))in_log2(x = x, cols = cols, log = log, method = 'Median')
   return(x)
 }
 
 # impute NAs in numerical columns by the mean of corresponding columns
-impute_mean<-function(x,cols=colnames(x),idx=row.names(x),log=FALSE){
-  if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
+impute_mean<-function(x,cols=colnames(x),idx=row.names(x), log = eval.parent(in_log_default)){
+  if(is.null(dim(x)))stop(in_msg1)
   for(i in 1:length(cols))x[is.na(x[,cols[i]]),cols[i]]<-mean(x[idx,cols[i]],na.rm = TRUE)
-  if(is.list(log))log_plan2(x = x, cols = cols, log = log, method = 'Mean')
+  if(is.list(log))in_log2(x = x, cols = cols, log = log, method = 'Mean')
   return(x)
 }
diff --git a/R/internal.R b/R/internal.R
@@ -1,11 +1,15 @@
 
+# inner variables
+in_log_default <- quote(if(any('log_arg' %in% ls())&&is.list(log_arg))log_arg)
+in_msg1 <- 'data frame degraded to vector, use df[ , , drop=FALSE]'
+
 # I'm very interested in knowing your opinion of log files.
 ## Please let me know it at https://github.com/sherrisherry/cleandata/issues
 ## Even if you think of log files as useless, please let me know.
 
 # A layout of content in log files, which lists the columns an operation affected and a table of scheme this operation used.
-log_plan1 <- function(x, log, sche.names, sche.codes){
-    sink(file=log$file,append = log$append, split = log$split) # divert output to file
+in_log1 <- function(x, log, sche.names, sche.codes){
+    do.call(sink, log) # divert output to file
     cat('Columns:\n\t')
     cat(colnames(x),sep=', ')
     cat('\nScheme:\n')
@@ -17,13 +21,16 @@ log_plan1 <- function(x, log, sche.names, sche.codes){
 }
 
 # A layout of content in log files, which lists the columns an operation affected, optionally with details of this operation.
-log_plan2 <- function(x, cols, log, proc = 'i', method, details = FALSE){
-    sink(file=log$file,append = log$append, split = log$split) # divert output to file
+in_log2 <- function(x, cols, log, proc = 'i', method, details = FALSE){
+  do.call(sink, log) # divert output to file
 	proc <- switch(proc, i='Imputed', e='Encoded', p='Partitioned')
     cat(paste('Columns ', proc, ' by ', method, ':\n\t', sep=''))
 	cols <- colnames(x[,cols, drop = FALSE])
     cat(cols, sep=', ')
     cat('\n\n')
 	if(is.character(details))cat(paste('Details:\n', details, '\n\n', sep = ''))
     sink() # divert output back to console
-}
+}
+
+# mathematical mode
+in_mode <- function(x){names(which.max(table(x)))}
diff --git a/R/others.R b/R/others.R
@@ -0,0 +1,10 @@
+
+wh_dict <- function(x, attr, value){
+  if(missing(attr) || missing(value))stop('Please supply attr and value')
+  lv <- unique(x[, attr])
+  dictionary <- data.frame(lv)
+  colnames(dictionary) <- attr
+  dictionary$Keys <- NA
+  for(i in 1:length(lv))dictionary[i, 'Keys'] <- as.character(x[x[,attr]==lv[i], value][1])
+  return(dictionary)
+}
diff --git a/R/partitioners.R b/R/partitioners.R
@@ -1,5 +1,5 @@
-partition_random <- function(x, name = 'Partition', train, val = 10^ceiling(log10(train))-train, test = TRUE, seed = FALSE, log=FALSE){
-	if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
+partition_random <- function(x, name = 'Partition', train, val = 10^ceiling(log10(train))-train, test = TRUE, seed = FALSE, log = eval.parent(in_log_default)){
+	if(is.null(dim(x)))stop(in_msg1)
 	nrows <- nrow(x)
 	if(nrows < 10)stop('This dataset is too small to be partitioned')
 	test <- ifelse(test, 10^ceiling(log10(train))-train-val, 0)
@@ -25,6 +25,6 @@ partition_random <- function(x, name = 'Partition', train, val = 10^ceiling(log1
 	msg <- paste('Train: ', round(train/nrows*100,2), '%, Validation: ', round(val/nrows*100,2), '%, Test: ', round(test/nrows*100,2), '%', sep='')
 	cat(paste(msg,'\n'))
 	if(is.list(log))
-		log_plan2(x = y, cols = name, log = log, proc = 'p', method = 'Random', details = msg)
+		in_log2(x = y, cols = name, log = log, proc = 'p', method = 'Random', details = msg)
 	return(y)
 }
diff --git a/README.md b/README.md
@@ -3,14 +3,20 @@
 [![CRAN status](https://www.r-pkg.org/badges/version/cleandata)](https://cran.r-project.org/package=cleandata) [![DOWNLOADSTOTAL](https://cranlogs.r-pkg.org/badges/grand-total/cleandata)](https://cranlogs.r-pkg.org/badges/grand-total/cleandata)
 
 
-A collection of functions that work with data frame to inspect, impute, encode, and partition data. The functions for imputation, encoding, and partitioning can produce log files to help you keep track of data manipulation process.
+A collection of functions that work with data frames to inspect and manipulate data, and to keep track of data manipulation by producing log files.
 
 Available on CRAN: https://cran.r-project.org/package=cleandata
 
-Demonstration: [Wrangling Ames Housing Dataset](http://rpubs.com/neilalien/rpkgcd020demo01)
+Demonstration: [Wrangling Ames Housing Dataset](https://cran.r-project.org/web/packages/cleandata/vignettes/Demo.html)
 
 *I planned to keep writing new demos and linking them in this Readme file.*
 
+## New in V0.3.0
+
+* Made parameter 'log' able to take value from a 'log_arg' variable in the parent environment (dynamic scoping) of a function
+  * The old way of assigning value to 'log' is also supported
+  * 'log' is the parameter to control producing log files
+
 ### List of Functions
 
 * Inspection
@@ -31,9 +37,12 @@ Demonstration: [Wrangling Ames Housing Dataset](http://rpubs.com/neilalien/rpkgc
 * Partitioning
   * partition_random:	Partition A Dataset Randomly
 
+* Other
+  * wh_dict:	Create Data Dictionary from Data Warehouse
+
 ## Installation
 
-You can install from [CRAN](https://cran.r-project.org/package=cleandata) (submission is scheduled to Sep 11 due to a CRAN vacation):
+You can install from [CRAN](https://cran.r-project.org/package=cleandata):
 
 ```r
 install.packages('cleandata')
@@ -43,7 +52,7 @@ Alternatively, you can download the source package from the release page of this
 
 ```r
 # place the source package in your work directory
-install.packages('cleandata_0.2.0.tar.gz', repos = NULL, type="source")
+install.packages('cleandata_0.3.0.tar.gz', repos = NULL, type="source")
 ```
 
 ## Usage
@@ -73,5 +82,7 @@ browseVignettes('cleandata')
 
 ## Update History
 
-* 07/18/2018: version 0.1.0 submitted to CRAN
-* 09/03/2018: version 0.2.0 uploaded to GitHub
+* 07/18/2018: version 0.1.0 was submitted to CRAN
+* 09/03/2018: version 0.2.0 was uploaded to GitHub
+* 09/13/2018: version 0.2.0 was submitted to CRAN
+* 11/30/2018: version 0.3.0 was submitted to CRAN
diff --git a/cran-comments.md b/cran-comments.md
@@ -37,3 +37,38 @@ R version 3.5.1 (2018-07-02)
 
 		The same warning as above. CRAN submission is scheduled for Sep 11 due to a CRAN vacation.
 
+09/13/2018 --------------------------------
+
+After some email correspondence with CRAN admins, I decided to use a static HTML vignette, which CRAN doesn't check, so there is no need to include the dataset.
+
+
+### v0.3.0
+
+11/30/2018 --------------------------------
+
+winbuilder:
+Installation time in seconds: 5
+Check time in seconds: 56
+Status: 1 NOTE
+R version 3.5.1 (2018-07-02)
+* using platform: x86_64-w64-mingw32 (64-bit)
+* checking Rd line widths ... NOTE
+Rd file 'encode_onehot.Rd':
+  \usage lines wider than 90 characters:
+     encode_onehot(x, colname.sep = '_', drop1st = FALSE, full_print=TRUE, log = eval.parent(in_log_default))
+
+Rd file 'encode_ordinal.Rd':
+  \usage lines wider than 90 characters:
+     encode_ordinal(x, order, none='', out.int=FALSE, full_print=TRUE, log = eval.parent(in_log_default))
+
+Rd file 'partition_random.Rd':
+  \usage lines wider than 90 characters:
+                                     test = TRUE, seed = FALSE, log = eval.parent(in_log_default))
+
+These lines will be truncated in the PDF manual.
+
+		Shortened the lines.
+
+Installation time in seconds: 5
+Check time in seconds: 59
+Status: OK
diff --git a/man/encode_binary.Rd b/man/encode_binary.Rd
@@ -2,7 +2,7 @@
 \alias{encode_binary}
 \title{Encode Binary Data Into 0 and 1}
 \usage{
-encode_binary(x, out.int=FALSE, full_print=TRUE, log=FALSE)
+encode_binary(x, out.int=FALSE, full_print=TRUE, log = eval.parent(in_log_default))
 }
 \description{
 Encodes binary data into 0 and 1. Optionally records the result into a log file.
@@ -14,7 +14,7 @@ Encodes binary data into 0 and 1. Optionally records the result into a log file.
 
 \item{full_print}{When set to \code{FALSE}, only print minimum information.  A full output includes summary of \code{x} before and after encoding.}
 
-\item{log}{Controls log files.  If \code{FALSE}, no log file.  To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
+\item{log}{Controls log files.  To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
 }
 \value{
 An encoded data frame.

diff --git a/man/encode_onehot.Rd b/man/encode_onehot.Rd
@@ -2,7 +2,8 @@
 \alias{encode_onehot}
 \title{One-Hot Encoding}
 \usage{
-encode_onehot(x, colname.sep = '_', drop1st = FALSE, full_print=TRUE, log=FALSE)
+encode_onehot(x, colname.sep = '_', drop1st = FALSE,
+    full_print=TRUE, log = eval.parent(in_log_default))
 }
 \description{
 Encodes categorical data by One-hot encoding. Optionally records the result into a log file.
@@ -16,7 +17,7 @@ Encodes categorical data by One-hot encoding. Optionally records the result into
 
 \item{full_print}{When set to \code{FALSE}, only print minimum information.  A full output includes summary of \code{x} before and after encoding.}
 
-\item{log}{Controls log files.  If \code{FALSE}, no log file.  To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
+\item{log}{Controls log files.  To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
 }
 \value{
 An encoded data frame.

diff --git a/man/encode_ordinal.Rd b/man/encode_ordinal.Rd
@@ -7,7 +7,8 @@ Encode Ordinal Data Into Integers
 Encodes ordinal data into sequential integers by a given order.  Optionally records the result into a log file.
 }
 \usage{
-encode_ordinal(x, order, none='', out.int=FALSE, full_print=TRUE, log=FALSE)
+encode_ordinal(x, order, none='', out.int=FALSE,
+    full_print=TRUE, log = eval.parent(in_log_default))
 }
 \arguments{
 \item{x}{The data frame}
@@ -20,7 +21,7 @@ encode_ordinal(x, order, none='', out.int=FALSE, full_print=TRUE, log=FALSE)
 
 \item{full_print}{When set to \code{FALSE}, only print minimum information.  A full output includes summary of \code{x} before and after encoding.}
 
-\item{log}{Controls log files.  If \code{FALSE}, no log file.  To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
+\item{log}{Controls log files.  To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
 }
 \value{
 An encoded data frame.

diff --git a/man/impute.Rd b/man/impute.Rd
@@ -5,11 +5,11 @@
 \alias{impute_mean}
 \title{Impute Missing Values}
 \usage{
-impute_mode(x,cols=colnames(x),idx=row.names(x),log=FALSE)
+impute_mode(x,cols=colnames(x),idx=row.names(x),log = eval.parent(in_log_default))
 
-impute_median(x,cols=colnames(x),idx=row.names(x),log=FALSE)
+impute_median(x,cols=colnames(x),idx=row.names(x),log = eval.parent(in_log_default))
 
-impute_mean(x,cols=colnames(x),idx=row.names(x),log=FALSE)
+impute_mean(x,cols=colnames(x),idx=row.names(x),log = eval.parent(in_log_default))
 }
 \description{
 \code{impute_mode}:  Impute \code{NA}s by the modes of their corresponding columns.
@@ -23,9 +23,9 @@ impute_mean(x,cols=colnames(x),idx=row.names(x),log=FALSE)
 
 \item{cols}{The index of columns of \code{x} to be imputed.}
 
-\item{idx}{The index of rows of \code{x} to be used to calculate the values to impute \code{NA}s. Use this parameter to prevent leakage.}
+\item{idx}{The index of rows of \code{x} to be used to calculate the values to impute \code{NA}s.  Use this parameter to prevent leakage.}
 
-\item{log}{Controls log files.  If \code{FALSE}, no log file.  To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
+\item{log}{Controls log files.  To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
 }
 \value{
 An imputed data frame.

diff --git a/man/partition_random.Rd b/man/partition_random.Rd
@@ -2,9 +2,9 @@
 \alias{partition_random}
 \title{Partitioning A Dataset Randomly}
 \usage{
-partition_random(x, name = 'Partition', train, 
-				val = 10^ceiling(log10(train))-train, 
-				test = TRUE, seed = FALSE, log=FALSE)
+partition_random(x, name = 'Partition', train,
+    val = 10^ceiling(log10(train))-train, test = TRUE,
+		seed = FALSE, log = eval.parent(in_log_default))
 }
 \description{
 Designed to create a validation column. Optionally records the result into a log file.
@@ -22,7 +22,7 @@ Designed to create a validation column. Optionally records the result into a log
 
 \item{seed}{Whether to set a random seed.  If you want a reproducible result, pass a number to \code{seed} as the random seed.}
 
-\item{log}{Controls log files.  If \code{FALSE}, no log file.  To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
+\item{log}{Controls log files.  To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
 }
 \value{
 A partitioned column.
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-exportPattern("^inspect_.+|^encode_.+|^impute_.+|^partition_.+")
		+exportPattern("^(.$|[^i].+|i[^n].*|in[^_].+)")