Skip to content

Commit

Permalink
v0.3.0
Browse files Browse the repository at this point in the history
refer to NEWS.md for details of this update.
  • Loading branch information
sherrisherry committed Dec 2, 2018
1 parent e0ef585 commit 191ecc8
Show file tree
Hide file tree
Showing 18 changed files with 181 additions and 68 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
Package: cleandata
Type: Package
Title: To Inspect, Impute, Encode, and Partition Data; and to Keep Track of This Process
Version: 0.2.0
Title: To Inspect and Manipulate Data; and to Keep Track of This Process
Version: 0.3.0
Author: Sherry Zhao
Maintainer: Sherry Zhao <[email protected]>
Description: Functions to work with data frames to prepare data for further analysis.
The functions for imputation, encoding, and Partitioning can produce log files to keep track of data manipulation process.
The functions for imputation, encoding, partitioning, and other manipulation can produce log files to keep track of process.
BugReports: https://github.com/sherrisherry/cleandata/issues
URL: https://github.com/sherrisherry/cleandata
Depends: R (>= 3.0.0)
Imports: stats
Suggests: rmarkdown, knitr
Suggests: R.rsp
License: MIT + file LICENSE
Encoding: UTF-8
VignetteBuilder: knitr
VignetteBuilder: R.rsp
LazyData: true
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
YEAR: 2018
YEAR: 2018 - 2019
COPYRIGHT HOLDER: Xiaoli (Sherry) Zhao
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1 +1 @@
exportPattern("^inspect_.+|^encode_.+|^impute_.+|^partition_.+")
exportPattern("^(.$|[^i].+|i[^n].*|in[^_].+)")
9 changes: 9 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
### v0.3.0
* made parameter 'log' able to take value from a 'log_arg' variable in the parent environment (dynamic scoping) of a function
* the old way of assigning value to 'log' is still supported
* new function:
* wh_dict: Create Data Dictionary from Data Warehouse
* moved some code to internal functions and internal variables
* revised documentation
* bug fixes

### v0.2.0
* new functions:
1. partition_random: partitioning a dataset randomly
Expand Down
18 changes: 9 additions & 9 deletions R/encoders.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
encode_ordinal<-function(x,order,none='',out.int=FALSE,full_print=TRUE,log=FALSE){
if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
encode_ordinal<-function(x,order,none='',out.int=FALSE,full_print=TRUE, log = eval.parent(in_log_default)){
if(is.null(dim(x)))stop(in_msg1)
if(full_print)print(summary(x))
for(i in 1:ncol(x)){
for(j in 1:length(order))levels(x[,i])[levels(x[,i])==order[j]] <- j
Expand All @@ -13,12 +13,12 @@ encode_ordinal<-function(x,order,none='',out.int=FALSE,full_print=TRUE,log=FALSE
x[,i]<-as.integer(x[,i])}
if(full_print)print(summary(x))
}
if(is.list(log))log_plan1(x = x, log = log, sche.names = c(none,order), sche.codes = 0:length(order))
if(is.list(log))in_log1(x = x, log = log, sche.names = c(none,order), sche.codes = 0:length(order))
return(x)
}

encode_binary<-function(x,out.int=FALSE,full_print=TRUE,log=FALSE){
if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
encode_binary<-function(x,out.int=FALSE,full_print=TRUE, log = eval.parent(in_log_default)){
if(is.null(dim(x)))stop(in_msg1)
if(full_print)print(summary(x))
if(is.list(log)){
map<-inspect_map(x,message=FALSE)
Expand All @@ -30,7 +30,7 @@ encode_binary<-function(x,out.int=FALSE,full_print=TRUE,log=FALSE){
for(j in 1:ncol(y))levels(y[,j])<-c(0,1)
x[,cols[[i]]]<-y
cat(paste('coded',j,'cols','\n'))
log_plan1(x = x, log = log, sche.names = lvs[[i]], sche.codes = c(0,1))
in_log1(x = x, log = log, sche.names = lvs[[i]], sche.codes = c(0,1))
}
rm(y)
}
Expand All @@ -48,8 +48,8 @@ encode_binary<-function(x,out.int=FALSE,full_print=TRUE,log=FALSE){
return(x)
}

encode_onehot<-function(x, colname.sep = '_', drop1st=FALSE, full_print=TRUE, log=FALSE){
if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
encode_onehot<-function(x, colname.sep = '_', drop1st=FALSE, full_print=TRUE, log = eval.parent(in_log_default)){
if(is.null(dim(x)))stop(in_msg1)
if(sum(is.na(x)))warning('NAs are ignored in encoding')
if(full_print)print(summary(x))
cols<-colnames(x)
Expand All @@ -62,6 +62,6 @@ encode_onehot<-function(x, colname.sep = '_', drop1st=FALSE, full_print=TRUE, lo
}
if(full_print)print(apply(encoded,2,sum))
if(is.list(log))
log_plan2(x = x, log = log, proc = 'e', method = 'Onehot', details = paste('Template of New Column Names: oldname', colname.sep, 'level; Dropping 1st Level: ', drop1st, sep=''))
in_log2(x = x, log = log, proc = 'e', method = 'Onehot', details = paste('Template of New Column Names: oldname', colname.sep, 'level; Dropping 1st Level: ', drop1st, sep=''))
return(encoded)
}
20 changes: 10 additions & 10 deletions R/imputers.R
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
# impute NAs in factorial columns by the mode of corresponding columns
impute_mode<-function(x,cols=colnames(x),idx=row.names(x),log=FALSE){
if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
for(i in 1:length(cols))x[is.na(x[,cols[i]]),cols[i]]<-names(which.max(table(x[idx,cols[i]])))
if(is.list(log))log_plan2(x = x, cols = cols, log = log, method = 'Mode')
impute_mode<-function(x,cols=colnames(x),idx=row.names(x),log = eval.parent(in_log_default)){
if(is.null(dim(x)))stop(in_msg1)
for(i in 1:length(cols))x[is.na(x[,cols[i]]),cols[i]]<-in_mode(x[idx,cols[i]])
if(is.list(log))in_log2(x = x, cols = cols, log = log, method = 'Mode')
return(x)
}

# impute NAs in numerical columns by the median of corresponding columns
impute_median<-function(x,cols=colnames(x),idx=row.names(x),log=FALSE){
if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
impute_median<-function(x,cols=colnames(x),idx=row.names(x), log = eval.parent(in_log_default)){
if(is.null(dim(x)))stop(in_msg1)
for(i in 1:length(cols))x[is.na(x[,cols[i]]),cols[i]]<-stats::median(x[idx,cols[i]],na.rm = TRUE)
if(is.list(log))log_plan2(x = x, cols = cols, log = log, method = 'Median')
if(is.list(log))in_log2(x = x, cols = cols, log = log, method = 'Median')
return(x)
}

# impute NAs in numerical columns by the mean of corresponding columns
impute_mean<-function(x,cols=colnames(x),idx=row.names(x),log=FALSE){
if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
impute_mean<-function(x,cols=colnames(x),idx=row.names(x), log = eval.parent(in_log_default)){
if(is.null(dim(x)))stop(in_msg1)
for(i in 1:length(cols))x[is.na(x[,cols[i]]),cols[i]]<-mean(x[idx,cols[i]],na.rm = TRUE)
if(is.list(log))log_plan2(x = x, cols = cols, log = log, method = 'Mean')
if(is.list(log))in_log2(x = x, cols = cols, log = log, method = 'Mean')
return(x)
}
17 changes: 12 additions & 5 deletions R/internal.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@

# inner variables
# Unevaluated default for the 'log' parameter of the encode_*, impute_* and
# partition_random functions, which evaluate it with eval.parent().
# It yields 'log_arg' when ls() lists a variable of that name and that variable
# is a list. Otherwise the 'if' has no 'else' branch and yields NULL, so the
# callers' is.list(log) check fails and no log is written.
in_log_default <- quote(if(any('log_arg' %in% ls())&&is.list(log_arg))log_arg)
# Error message used when 'x' has no dim attribute, i.e. is.null(dim(x)).
in_msg1 <- 'data frame degraded to vector, use df[ , , drop=FALSE]'

# I'm very interested in knowing your opinion of log files.
## Please let me know it at https://github.com/sherrisherry/cleandata/issues
## Even if you think of log files as useless, please let me know.

# A layout of content in log files, which lists the columns an operation affected and a table of scheme this operation used.
log_plan1 <- function(x, log, sche.names, sche.codes){
sink(file=log$file,append = log$append, split = log$split) # divert output to file
in_log1 <- function(x, log, sche.names, sche.codes){
do.call(sink, log) # divert output to file
cat('Columns:\n\t')
cat(colnames(x),sep=', ')
cat('\nScheme:\n')
Expand All @@ -17,13 +21,16 @@ log_plan1 <- function(x, log, sche.names, sche.codes){
}

# A layout of content in log files, which lists the columns an operation affected, optionally with details of this operation.
log_plan2 <- function(x, cols, log, proc = 'i', method, details = FALSE){
sink(file=log$file,append = log$append, split = log$split) # divert output to file
# Write a log entry that lists the columns an operation affected, optionally
# followed by details of that operation.
#   x        data frame whose column names are written to the log
#   cols     index of the affected columns of 'x'
#            NOTE(review): encode_onehot calls this without 'cols'; presumably
#            all columns of 'x' are then listed - confirm
#   log      list of arguments passed to sink(), e.g. file, append, split
#   proc     'i' (Imputed), 'e' (Encoded) or 'p' (Partitioned)
#   method   name of the method, printed in the heading
#   details  character text printed under 'Details:'; anything else is skipped
in_log2 <- function(x, cols, log, proc = 'i', method, details = FALSE){
  proc <- switch(proc, i='Imputed', e='Encoded', p='Partitioned')
  # Resolve the column names before diverting output, so that an invalid
  # 'cols' fails while output still goes to the console.
  cols <- colnames(x[,cols, drop = FALSE])
  do.call(sink, log) # divert output to file
  # Restore output on every exit path; an error below would otherwise leave
  # all later console output diverted to the log file.
  on.exit(sink(), add = TRUE)
  cat(paste('Columns ', proc, ' by ', method, ':\n\t', sep=''))
  cat(cols, sep=', ')
  cat('\n\n')
  if(is.character(details))cat(paste('Details:\n', details, '\n\n', sep = ''))
}
}

# statistical mode: name of the most frequent value of x
# (on ties, which.max() keeps the first one in table order)
in_mode <- function(x){
  counts <- table(x)
  top <- which.max(counts)
  names(top)
}
10 changes: 10 additions & 0 deletions R/others.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

# Create a data dictionary: one row per distinct value of column 'attr',
# paired with the 'value' entry of the first row of 'x' holding that value.
#   x      data frame to look up
#   attr   name of the column whose distinct values are listed
#   value  name of the column that supplies the 'Keys' entries
# Returns a data frame with the distinct values in a column named after 'attr'
# and the matching entries, converted to character, in a column 'Keys'.
# Stops with an error when 'attr' or 'value' is not supplied.
wh_dict <- function(x, attr, value){
  if(missing(attr) || missing(value))stop('Please supply attr and value')
  attr_col <- x[, attr]
  lv <- unique(attr_col)
  dictionary <- data.frame(lv)
  colnames(dictionary) <- attr
  # match() returns the first row of every level in one vectorised step.
  # Subsetting with 'attr_col == level' returned NA whenever an NA in 'attr'
  # came before the level's first row, because NA indices select NA rows.
  # An NA level now maps to the 'value' of the first NA row.
  first_row <- match(lv, attr_col)
  dictionary$Keys <- as.character(x[first_row, value])
  return(dictionary)
}
6 changes: 3 additions & 3 deletions R/partitioners.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
partition_random <- function(x, name = 'Partition', train, val = 10^ceiling(log10(train))-train, test = TRUE, seed = FALSE, log=FALSE){
if(is.null(dim(x)))stop('data frame degraded to vector, use df[ , , drop=FALSE]')
partition_random <- function(x, name = 'Partition', train, val = 10^ceiling(log10(train))-train, test = TRUE, seed = FALSE, log = eval.parent(in_log_default)){
if(is.null(dim(x)))stop(in_msg1)
nrows <- nrow(x)
if(nrows < 10)stop('This dataset is too small to be partitioned')
test <- ifelse(test, 10^ceiling(log10(train))-train-val, 0)
Expand All @@ -25,6 +25,6 @@ partition_random <- function(x, name = 'Partition', train, val = 10^ceiling(log1
msg <- paste('Train: ', round(train/nrows*100,2), '%, Validation: ', round(val/nrows*100,2), '%, Test: ', round(test/nrows*100,2), '%', sep='')
cat(paste(msg,'\n'))
if(is.list(log))
log_plan2(x = y, cols = name, log = log, proc = 'p', method = 'Random', details = msg)
in_log2(x = y, cols = name, log = log, proc = 'p', method = 'Random', details = msg)
return(y)
}
23 changes: 17 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@
[![CRAN status](https://www.r-pkg.org/badges/version/cleandata)](https://cran.r-project.org/package=cleandata) [![DOWNLOADSTOTAL](https://cranlogs.r-pkg.org/badges/grand-total/cleandata)](https://cranlogs.r-pkg.org/badges/grand-total/cleandata)


A collection of functions that work with data frame to inspect, impute, encode, and partition data. The functions for imputation, encoding, and partitioning can produce log files to help you keep track of data manipulation process.
A collection of functions that work with data frames to inspect and manipulate data, and to keep track of data manipulation by producing log files.

Available on CRAN: https://cran.r-project.org/package=cleandata

Demonstration: [Wrangling Ames Housing Dataset](http://rpubs.com/neilalien/rpkgcd020demo01)
Demonstration: [Wrangling Ames Housing Dataset](https://cran.r-project.org/web/packages/cleandata/vignettes/Demo.html)

*I plan to keep writing new demos and linking them in this Readme file.*

## New in V0.3.0

* Made parameter 'log' able to take value from a 'log_arg' variable in the parent environment (dynamic scoping) of a function
* The old way of assigning value to 'log' is also supported
* 'log' is the parameter to control producing log files

### List of Functions

* Inspection
Expand All @@ -31,9 +37,12 @@ Demonstration: [Wrangling Ames Housing Dataset](http://rpubs.com/neilalien/rpkgc
* Partitioning
* partition_random: Partition A Dataset Randomly

* Other
* wh_dict: Create Data Dictionary from Data Warehouse

## Installation

You can install from [CRAN](https://cran.r-project.org/package=cleandata) (submission is scheduled to Sep 11 due to a CRAN vacation):
You can install from [CRAN](https://cran.r-project.org/package=cleandata):

```r
install.packages('cleandata')
Expand All @@ -43,7 +52,7 @@ Alternatively, you can download the source package from the release page of this

```r
# place the source package in your work directory
install.packages('cleandata_0.2.0.tar.gz', repos = NULL, type="source")
install.packages('cleandata_0.3.0.tar.gz', repos = NULL, type="source")
```

## Usage
Expand Down Expand Up @@ -73,5 +82,7 @@ browseVignettes('cleandata')

## Update History

* 07/18/2018: version 0.1.0 submitted to CRAN
* 09/03/2018: version 0.2.0 uploaded to GitHub
* 07/18/2018: version 0.1.0 was submitted to CRAN
* 09/03/2018: version 0.2.0 was uploaded to GitHub
* 09/13/2018: version 0.2.0 was submitted to CRAN
* 11/30/2018: version 0.3.0 was submitted to CRAN
35 changes: 35 additions & 0 deletions cran-comments.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,38 @@ R version 3.5.1 (2018-07-02)

The same warning as above. CRAN submission is scheduled for Sep 11 due to a CRAN vacation.

09/13/2018 --------------------------------

After some email correspondence with CRAN admins, I decided to use static html vignette, which CRAN doesn't check, thus no need to include the dataset.


### v0.3.0

11/30/2018 --------------------------------

winbuilder:
Installation time in seconds: 5
Check time in seconds: 56
Status: 1 NOTE
R version 3.5.1 (2018-07-02)
* using platform: x86_64-w64-mingw32 (64-bit)
* checking Rd line widths ... NOTE
Rd file 'encode_onehot.Rd':
\usage lines wider than 90 characters:
encode_onehot(x, colname.sep = '_', drop1st = FALSE, full_print=TRUE, log = eval.parent(in_log_default))

Rd file 'encode_ordinal.Rd':
\usage lines wider than 90 characters:
encode_ordinal(x, order, none='', out.int=FALSE, full_print=TRUE, log = eval.parent(in_log_default))

Rd file 'partition_random.Rd':
\usage lines wider than 90 characters:
test = TRUE, seed = FALSE, log = eval.parent(in_log_default))

These lines will be truncated in the PDF manual.

Shortened the lines.

Installation time in seconds: 5
Check time in seconds: 59
Status: OK
4 changes: 2 additions & 2 deletions man/encode_binary.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
\alias{encode_binary}
\title{Encode Binary Data Into 0 and 1}
\usage{
encode_binary(x, out.int=FALSE, full_print=TRUE, log=FALSE)
encode_binary(x, out.int=FALSE, full_print=TRUE, log = eval.parent(in_log_default))
}
\description{
Encodes binary data into 0 and 1. Optionally records the result into a log file.
Expand All @@ -14,7 +14,7 @@ Encodes binary data into 0 and 1. Optionally records the result into a log file.

\item{full_print}{When set to \code{FALSE}, only print minimum information. A full output includes summary of \code{x} before and after encoding.}

\item{log}{Controls log files. If \code{FALSE}, no log file. To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
\item{log}{Controls log files. To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
}
\value{
An encoded data frame.
Expand Down
5 changes: 3 additions & 2 deletions man/encode_onehot.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
\alias{encode_onehot}
\title{One-Hot Encoding}
\usage{
encode_onehot(x, colname.sep = '_', drop1st = FALSE, full_print=TRUE, log=FALSE)
encode_onehot(x, colname.sep = '_', drop1st = FALSE,
full_print=TRUE, log = eval.parent(in_log_default))
}
\description{
Encodes categorical data by One-hot encoding. Optionally records the result into a log file.
Expand All @@ -16,7 +17,7 @@ Encodes categorical data by One-hot encoding. Optionally records the result into

\item{full_print}{When set to \code{FALSE}, only print minimum information. A full output includes summary of \code{x} before and after encoding.}

\item{log}{Controls log files. If \code{FALSE}, no log file. To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
\item{log}{Controls log files. To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
}
\value{
An encoded data frame.
Expand Down
5 changes: 3 additions & 2 deletions man/encode_ordinal.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ Encode Ordinal Data Into Integers
Encodes ordinal data into sequential integers by a given order. Optionally records the result into a log file.
}
\usage{
encode_ordinal(x, order, none='', out.int=FALSE, full_print=TRUE, log=FALSE)
encode_ordinal(x, order, none='', out.int=FALSE,
full_print=TRUE, log = eval.parent(in_log_default))
}
\arguments{
\item{x}{The data frame}
Expand All @@ -20,7 +21,7 @@ encode_ordinal(x, order, none='', out.int=FALSE, full_print=TRUE, log=FALSE)

\item{full_print}{When set to \code{FALSE}, only print minimum information. A full output includes summary of \code{x} before and after encoding.}

\item{log}{Controls log files. If \code{FALSE}, no log file. To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
\item{log}{Controls log files. To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
}
\value{
An encoded data frame.
Expand Down
10 changes: 5 additions & 5 deletions man/impute.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
\alias{impute_mean}
\title{Impute Missing Values}
\usage{
impute_mode(x,cols=colnames(x),idx=row.names(x),log=FALSE)
impute_mode(x,cols=colnames(x),idx=row.names(x),log = eval.parent(in_log_default))

impute_median(x,cols=colnames(x),idx=row.names(x),log=FALSE)
impute_median(x,cols=colnames(x),idx=row.names(x),log = eval.parent(in_log_default))

impute_mean(x,cols=colnames(x),idx=row.names(x),log=FALSE)
impute_mean(x,cols=colnames(x),idx=row.names(x),log = eval.parent(in_log_default))
}
\description{
\code{impute_mode}: Impute \code{NA}s by the modes of their corresponding columns.
Expand All @@ -23,9 +23,9 @@ impute_mean(x,cols=colnames(x),idx=row.names(x),log=FALSE)

\item{cols}{The index of columns of \code{x} to be imputed.}

\item{idx}{The index of rows of \code{x} to be used to calculate the values to impute \code{NA}s. Use this parameter to prevent leakage.}
\item{idx}{The index of rows of \code{x} to be used to calculate the values to impute \code{NA}s. Use this parameter to prevent leakage.}

\item{log}{Controls log files. If \code{FALSE}, no log file. To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
\item{log}{Controls log files. To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
}
\value{
An imputed data frame.
Expand Down
8 changes: 4 additions & 4 deletions man/partition_random.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
\alias{partition_random}
\title{Partitioning A Dataset Randomly}
\usage{
partition_random(x, name = 'Partition', train,
val = 10^ceiling(log10(train))-train,
test = TRUE, seed = FALSE, log=FALSE)
partition_random(x, name = 'Partition', train,
val = 10^ceiling(log10(train))-train, test = TRUE,
seed = FALSE, log = eval.parent(in_log_default))
}
\description{
Designed to create a validation column. Optionally records the result into a log file.
Expand All @@ -22,7 +22,7 @@ Designed to create a validation column. Optionally records the result into a log

\item{seed}{Whether to set a random seed. If you want a reproducible result, pass a number to \code{seed} as the random seed.}

\item{log}{Controls log files. If \code{FALSE}, no log file. To produce log files, pass a list of \code{file}, \code{append}, and \code{split} as arguments for \code{sink()}.}
\item{log}{Controls log files. To produce log files, assign it or the \code{log_arg} variable in the parent environment (dynamic scope) a list of arguments for \code{sink()}, such as \code{file}, \code{append}, and \code{split}.}
}
\value{
A partitioned column.
Expand Down
Loading

0 comments on commit 191ecc8

Please sign in to comment.