diff --git a/R_notes_version_0/R_Python.qmd b/R_notes_version_0/R_Python.qmd deleted file mode 100644 index 6c12425..0000000 --- a/R_notes_version_0/R_Python.qmd +++ /dev/null @@ -1,210 +0,0 @@ ---- -execute: - eval: false ---- - -# R⬌Python {.unnumbered} - -## 对象转换 - -| R | Python | 例 | -|------------------------|------------------------|------------------------| -| 单元素向量 | 标量Scalar | `1`、 `1L`、`TRUE`、`"foo"` | -| 未命名列表或多元素向量 | List | `c(1.0, 2.0, 3.0)`, `c(1L, 2L, 3L)` | -| 命名列表 | Dict | `list(a = 1L, b = 2.0)`, `dict(x = x_data)` | -| Matrix/Array | NumPy ndarray | `matrix(c(1,2,3,4), nrow = 2, ncol = 2)` | -| Data Frame | Pandas DataFrame | `data.frame(x = c(1,2,3), y = c("a", "b", "c"))` | -| Function | Python function | `function(x) x + 1` | -| NULL, TRUE, FALSE | None, True, False | `NULL`, `TRUE`, `ALSE` | - -: Type conversions - -## [`reticulate::`](https://rstudio.github.io/reticulate/index.html){.uri} - -```{r} -#| comment: "#>" -library(reticulate) -#devtools::install_version( "ggmap", version = "3.5.2") - -"R code" - - -``` - -```{r} -#| comment: "###>" - -"python code" -``` - -### **R 安装 python 模块** - -[reticulate:安装Python module](https://rstudio.github.io/reticulate/articles/python_packages.html) - -```{r eval=FALSE} -#| comment: "#>" -library(reticulate) -py_config() -py_module_available('pip') - -# Anaconda 激活环境 pip install scanpy -i https://pypi.tuna.tsinghua.edu.cn/simple/ ,依赖包含 numpy pandas -py_module_available('numpy') -py_module_available('pandas') -py_module_available('scanpy') -reticulate::repl_python() -``` - -### R 调用 Python 模块 - -```{r eval=FALSE} -#| comment: "#>" -# 调用os模块(module)的listdir()函数 -os <- reticulate::import("os") -os$listdir("./") - -# 调用seaborn模块的load_dataset()函数 -# 需要seaborn模块已安装 -sns <- import("seaborn") -tips <- sns$load_dataset("tips") -print(head(tips)) -``` - -### **R** → Python - -```{r} -#| comment: "#>" -A <- 1 -B <- c(1, 2, 3) -C <- c(a = 1, b = 2, c = 3) -D <- matrix(1:4, nrow = 2) -E <- data.frame(a = c(1, 2), b = c(3, 4)) -G <- list(1, 2, 3) -H <- list(c(1, 2), c(3, 4)) -I <- list(a = c(1, 2), b = c(3, 4)) -J <- function(a, b) { - return(a + b) - } -K1 <- NULL -K2 <- T -K3 <- F -``` - -```{python} -### float -r.A -type(r.A) - -### list -r.B -type(r.B) - -r.C -type(r.C) - -### numpy.ndarray -r.D -type(r.D) - -### pandas.core.frame.DataFrame -r.E -type(r.E) - -### list -r.G -type(r.G) - -r.H -type(r.H) - -### dict -r.I -type(r.I) - -### function -r.J -type(r.J) -r.J(2, 3) - - -### NoneType -r.K1 -type(r.K1) - - -### bool -r.K2 -type(r.K2) -r.K3 -type(r.K3) -``` - -### **Python** → R - -```{python} -import pandas as pd -m = [1, 2, 3] -n = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - -A = 1 -B = [1, 2, 3] -C = [[1, 2], [3, 4]] -D1 = [[1], 2, 3] -D2 = [[1, 2], 2, 3] - -E = (1, 2, 3) - -FF = ((1, 2), (3, 4)) - -G = ((1, 2), 3, 4) - -H = {"a": [1, 2, 3], "b": [2, 3, 4]} - -I = {"a": 1,"b": [2, 3, 4]} - -def J(a, b): - return a + b -``` - -```{r} -#| comment: "#>" - -py$m -py$n - -### integer -py$A -class(py$A) - -py$B -class(py$B) - -### list -py$C -class(py$C) - -py$D1 -class(py$D1) - -py$D2 -class(py$D2) - -py$E -class(py$E) - -py$FF -class(py$FF) - -py$G -class(py$G) - -py$H -class(py$H) - -py$I -class(py$I) - -### function -py$J -class(py$J) -py$J(2, 3) -``` diff --git a/R_notes_version_0/Rcpp.qmd b/R_notes_version_0/Rcpp.qmd deleted file mode 100644 index efe4701..0000000 --- a/R_notes_version_0/Rcpp.qmd +++ /dev/null @@ -1,73 +0,0 @@ -# Rcpp - -[Rcpp](http://www.rcpp.org/) - -[Rcpp4everyone](https://teuder.github.io/rcpp4everyone_en/) - - - - - -## 数据结构 - -### 向量类 - -`NumericVector` `IntegerVector` `CharacterVector` `LogicalVector` - -```{r} -library(Rcpp) -sourceCpp("function/sum_cpp.cpp") - -sum_cpp(mpg$displ) -sum(mpg$displ) - -sourceCpp("function/mean_cpp.cpp") -mean_cpp(mpg$displ) -``` - -`NumericMatrix` `IntegerMatrix` `CharacterMatrix` `LogicalMatrix` - -### 数据框类 - -DataFrame - -### 列表类 - -List - -```{r} -# 平均百分比误差 -sourceCpp("function/mpe.cpp") -mod <- lm(mpg ~ wt, data = mtcars) -mpe(mod) -``` - -### 函数类 - -```{r} -cppFunction("RObject callWithOne(Function f) { - return f(1); -}") - -callWithOne(function(x) x + 1) -callWithOne(paste) -``` - -### 属性 - -```{r} -cppFunction('NumericVector attribs() { - NumericVector out = NumericVector::create(1, 2, 3); - - out.names() = CharacterVector::create("a", "b", "c"); - out.attr("my-attr") = "my-value"; - out.attr("class") = "my-class"; - - return out; -}') - - -attribs() -``` - -## Rcpp sugar diff --git a/R_notes_version_0/R_notes.Rproj b/R_notes_version_0/Rdatascience.Rproj similarity index 100% rename from R_notes_version_0/R_notes.Rproj rename to R_notes_version_0/Rdatascience.Rproj diff --git a/R_notes_version_0/_quarto.yml b/R_notes_version_0/_quarto.yml index 025c22b..4e486f0 100644 --- a/R_notes_version_0/_quarto.yml +++ b/R_notes_version_0/_quarto.yml @@ -3,7 +3,7 @@ project: output-dir: docs book: - title: "R数据科学与编程" + title: "R数据科学" reader-mode: true chapters: - index.qmd @@ -22,17 +22,7 @@ book: - transform_tidy.qmd - functionals.qmd - dynamic_report.qmd - - - part: "大数据" - chapters: - - data.table.qmd - - arrow.qmd - - polars.qmd - - parallel_computing.qmd - - spark.qmd - - Rcpp.qmd - - R_Python.qmd - part: "编程" diff --git a/R_notes_version_0/arrow.qmd b/R_notes_version_0/arrow.qmd deleted file mode 100644 index beca635..0000000 --- a/R_notes_version_0/arrow.qmd +++ /dev/null @@ -1,21 +0,0 @@ -# Arrow - -- [arrow.apache.R](https://arrow.apache.org/docs/r/) - -## 安装 - -[apache-arrow.tar.gz](https://github.com/apache/arrow/releases/) - -```{r include=FALSE} -options(timeout = 300) - -if(!require(arrow)) install.packages("arrow") -``` - -read_parquet():读取 Parquet 格式的文件 - -read_delim_arrow():读取带分隔符的文本文件 - -read_csv_arrow():读取逗号分隔值 (CSV) 文件 - -read_tsv_arrow():读取制表符分隔值 (TSV) 文件 diff --git a/R_notes_version_0/data.table.qmd b/R_notes_version_0/data.table.qmd deleted file mode 100644 index 3a50510..0000000 --- a/R_notes_version_0/data.table.qmd +++ /dev/null @@ -1,49 +0,0 @@ -# `data.table` - -- [data.table](https://rdatatable.gitlab.io/data.table/) -- [dtplyr](https://dtplyr.tidyverse.org/) data.table back-end for 'dplyr' - -```{r} -library(data.table) -library(dtplyr) -library(dplyr, warn.conflicts = FALSE) -``` - -## dtplyr 语法 - -DTPLYR 使用 DPLYR 的语法s实现 [data.table](http://r-datatable.com/) 的速度;编写 DPLYR(和 TidyR)代码,DTPLYR 将其转换为等效的 Data.Table。 - -```{r} -mtcars2 <- lazy_dt(mtcars) -``` - -```{r} -dtplyr <- mtcars2 %>% - filter(wt < 5) %>% - mutate(l100k = 235.21 / mpg) %>% # liters / 100 km - group_by(cyl) %>% - summarise(l100k = mean(l100k)) -dtplyr - -dtplyr %>% show_query() - -dtplyr %>% as_tibble() -``` - -```{r} -dt <- data.table::as.data.table(mtcars) -dt[wt<5][, `:=`(l100k = 235.21/mpg)][, .(l100k = mean(l100k)), keyby = .(cyl)] -``` - -## data.table 语法 - -``` -DT[i, j, by] - -## R: i j by -## SQL: where | order by select | update group by -``` - -![](images/data.table_cheatsheet.jpg){fig-align="center"} - - diff --git a/R_notes_version_0/parallel_computing.qmd b/R_notes_version_0/parallel_computing.qmd deleted file mode 100644 index d094518..0000000 --- a/R_notes_version_0/parallel_computing.qmd +++ /dev/null @@ -1,46 +0,0 @@ -# 并行计算 - -Parallel computing - -## 硬件 - -```{r} -if(!require(benchmarkme)) install.packages("benchmarkme") -``` - -1字节(byte)= 8位(bit) 二进制数 = 1个 ASCII 字符。 - -International System of Units (SI) - -```{r} -# 随机存取存储器 Random access memory,RAM -get_ram() - - -# CPU -get_cpu() -``` - -standard hard disk drives (HDDs) - -Solid state drives (SSDs) - -## 并行计算 - -![](images/parallel_computation.jpeg){fig-align="center"} - -Cluster - -Master - -Worker - -Job -\> (Spilt) -\> task 1,...,n, -\> (feed) -\> R worker - - - -### **`furrr`** 包 - -### **`future.apply`** 包 - -### **`BiocParallel`** 包 diff --git a/R_notes_version_0/polars.qmd b/R_notes_version_0/polars.qmd deleted file mode 100644 index 7bae49c..0000000 --- a/R_notes_version_0/polars.qmd +++ /dev/null @@ -1,65 +0,0 @@ -# Polars - -## [polars](https://docs.pola.rs/) - -- [R polars](https://pola-rs.github.io/r-polars/) - -- [Cookbook Polars for R](https://ddotta.github.io/cookbook-rpolars/) - -```{r} -if(!require(polars)) - install.packages("polars", repos = "https://rpolars.r-universe.dev") - -polars_info() -polars_code_completion_activate() -``` - -Polars 的主要函数存储在 “pl” 命名空间中,可以使用 “`pl$`” 前缀进行访问,以防止与其他组件和base R 函数名称冲突 - -```{r} -iris_polars <- pl$DataFrame(iris) -iris_polars -``` - -访问属性 - -```{r} -iris_polars$shape -iris_polars$height -iris_polars$width - -# polars syntax -pl$DataFrame(iris)$ - select(c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"))$ - with_columns( - pl$when( - (pl$col("Petal.Length") / pl$col("Petal.Width") > 3) - )$then(pl$lit("long"))$ - otherwise(pl$lit("large"))$ - alias("petal_type") - )$ - filter(pl$col("Sepal.Length")$is_between(4.5, 5.5))$ - head(6) -``` - -```{r} -pl$read_csv("data/Advertising.csv") -``` - -## [tidypolars](https://tidypolars.etiennebacher.com/) - -```{r} -if(!require(tidypolars)) - install.packages('tidypolars', - repos = c('https://etiennebacher.r-universe.dev', getOption("repos")) - ) - -iris |> - as_polars_df() |> - select(starts_with(c("Sep", "Pet"))) |> - mutate( - petal_type = ifelse((Petal.Length / Petal.Width) > 3, "long", "large") - ) |> - filter(between(Sepal.Length, 4.5, 5.5)) |> - head() -``` diff --git a/R_notes_version_0/spark.qmd b/R_notes_version_0/spark.qmd deleted file mode 100644 index 1629d17..0000000 --- a/R_notes_version_0/spark.qmd +++ /dev/null @@ -1,141 +0,0 @@ ---- -execute: - eval: false ---- - -# Spark - - - -Apache Spark 是用于大规模数据处理的统一分析引擎。 - -Spark 提供了一组超出 MapReduce 的更丰富的动词,以方便优化在多台计算机中运行的代码。Spark 还将数据加载到内存中,使操作速度比 Hadoop 的磁盘存储快得多。 - -![](images/clipboard-2740544019.png){fig-align="center" width="60%"} - -## 安装 - -### java 8 - - - -```{r} -system(command = "E:/java/bin/java.exe -version",intern = T) %>% cat(.,sep = "\n") - - -# 在 R 中临时设置 JAVA_HOME 环境变量 -# Sys.setenv(JAVA_HOME = "E:/java") - -``` - -### sparklyr - -```{r} -#install.packages("sparklyr") -packageVersion("sparklyr") -``` - -### spark - -```{r} -library(sparklyr) -# C:\\Users\\DELL\\AppData\\Local/spark -options(spark.install.dir = "E:/spark/") -spark_install_dir() -# spark_available_versions() - -#spark_install(version = "3.3") -spark_installed_versions() - - -# spark_uninstall(version = "1.6.3", hadoop = "2.6") -``` - -## 连接 - -```{r} -library(sparklyr) -sc <- spark_connect(master = "local") -``` - -## 使用 - -```{r} -cars <- copy_to(sc, mtcars) - -cars -``` - -```{r} -library(dplyr) -select(cars, hp, mpg) %>% - sample_n(100) %>% - collect() %>% - plot() -``` - -```{r} -model <- ml_linear_regression(cars, mpg ~ hp) -model - -model %>% - ml_predict(copy_to(sc, data.frame(hp = 250 + 10 * 1:10))) %>% - transmute(hp = hp, mpg = prediction) %>% - full_join(select(cars, hp, mpg)) %>% - collect() %>% - plot() -``` - -```{r eval=FALSE} -spark_write_csv(cars, "data/spark/cars.csv") - -cars <- spark_read_csv(sc, "data/spark/cars.csv") -``` - -### 分布式 - -```{r} -cars %>% spark_apply(~round(.x)) -``` - -### 流 - -```{r eval=FALSE} -dir.create("data/spark/input") -dir.create("data/spark/output") -write.csv(mtcars, "data/spark/input/cars_1.csv", row.names = F) - - -stream <-stream_read_csv(sc, "data/spark/input/") %>% - select(mpg, cyl, disp) %>% - stream_write_csv("data/spark/output/") - -dir("data/spark/output", pattern = ".csv") - - -write.csv(mtcars, "data/spark/input/cars_2.csv", row.names = F) - -# 几秒钟后 -dir("data/spark/output", pattern = ".csv") - - -stream_stop(stream) - -file.remove("data/spark/input") -file.remove("data/spark/output") -``` - -## Web 界面 - -```{r eval=FALSE} -spark_web(sc) -``` - -![](images/clipboard-2463586693.png) - -## 断开连接 - -```{r} -spark_disconnect(sc) -spark_disconnect_all() -```