0

wanganlin00 · Dec 19, 2024 · 1c2dccc · 1c2dccc
1 parent be42893
commit 1c2dccc
Show file tree

Hide file tree

Showing 6 changed files with 104 additions and 79 deletions.
diff --git a/R_data_science/_quarto.yml b/R_data_science/_quarto.yml
@@ -7,21 +7,22 @@ book:
   reader-mode: true
   chapters:
     - index.qmd
-    - intro.qmd
+    - config.qmd
     - data_type.qmd
     - data_structure.qmd
     - read_write.qmd
     - base_function.qmd
 
 
+    - transform_tidy.qmd
     - numeric.qmd
     - logical_operation.qmd
     - datetime.qmd
-    - regular_expressions.qmd
+    - regular_expression.qmd
     - string.qmd
 
 
-    - transform_tidy.qmd
+
     - functionals.qmd
     - dynamic_report.qmd
 

diff --git a/R_data_science/base_function.qmd b/R_data_science/base_function.qmd
@@ -10,16 +10,23 @@
 | `<<-` | Left super-assignment: assigns in an enclosing scope (for advanced users) |
 | `->>` | Right super-assignment: assigns in an enclosing scope (for advanced users) |
 
-## 其他运算符
+## 子集运算符
+
+|  |  |
+|----|----|
+| \[ |  Operators acting on vectors, matrices, arrays and lists to extract or replace parts |
+| \[\[ | Extract a single element from a list or data frame |
+| \$ | Named list or dataframe column subset |
+| \@ | Accessing slots in S4 classes (Advanced) |
+
+## 特殊运算符
 
 | **Miscellaneous operator in R** | **Description** |
 |:--:|:--:|
-| \$ | Named list or dataframe column subset |
 | : | Sequence generator |
 | :: | Accessing functions of packages (not usually needed) |
 | ::: | Accessing internal functions of packages |
 | \~ | Model formulae |
-| \@ | Accessing slots in S4 classes (Advanced) |
 
 ## 逻辑索引
 
@@ -55,45 +62,6 @@ xu <- x[!duplicated(x)]
 xu
 ```
 
-## 子集运算
-
-`[`，`$`，`[[`
-
-S4 对象： `@` `slot`
-
-### `[ ]`
-
-```{r}
-x <- c(2.1, 4.2, 3.3, 5.4)
-
-x[c(3, 1)]
-
-x[-c(3, 1)]
-
-x[c(TRUE, TRUE, FALSE, FALSE)]
-x[x > 3]
-
-x[c(TRUE, FALSE)] # recycling rules 循环
-
-x[]
-
-x[0]
-```
-
-### `$`, `[[ ]]`
-
-`$`是一个简写运算符, `x$y`大致相当于 `x[["y"]]` ,从左到右部分赋值
-
-```{r}
-x <- list(abc = 1)
-x$a
-
-x[["a"]]
-
-options(warnPartialMatchDollar = TRUE)
-x$a
-```
-
 ## 生成数值序列
 
 ```{r}

diff --git a/R_data_science/intro.qmd → R_data_science/config.qmd b/R_data_science/intro.qmd → R_data_science/config.qmd
diff --git a/R_data_science/data_structure.qmd b/R_data_science/data_structure.qmd
@@ -414,6 +414,9 @@ attributes(df2)
 
 names(tibble(`1` = 1))
 
+# 循环补齐(recycling)
+data.frame(x = 1:4, y = 1, z=1:2)
+
 tibble(x = 1:4, y = 1)
 tibble(x = 1:4, y = 1:2)
 

diff --git a/R_data_science/regular_expressions.qmd → R_data_science/regular_expression.qmd b/R_data_science/regular_expressions.qmd → R_data_science/regular_expression.qmd
@@ -37,9 +37,6 @@ str_view(x, "\\d{2,}")
 str_view(x, "\\d{2,3}")
 ```
 
-```         
-```
-
 ### 懒惰量词
 
 在满足条件的情况下会尽量少地匹配
@@ -72,6 +69,11 @@ str_view(x, "ab*?")  # 匹配a
 | \\         | 对特殊字符进行转义，\[\\\^\\-\\\]\] 匹配 \^、-和 \] |
 
 ```{r}
+
+c("s.d \n zxc") %>%
+  str_view(".")
+
+
 str_view(words, "[aeiou]x[aeiou]")  #匹配 中间x两边元音
 str_view(words, "[^aeiou]y[^aeiou]") #匹配 中间y两边辅音
 str_view(fruit, "a...e")
@@ -87,7 +89,6 @@ str_view(fruit, "apple|melon|nut")
 str_view(fruit, "aa|ee|ii|oo|uu")
 ```
 
-
 ### Unicode字符
 
 -   **基本 Unicode 范围匹配**：对于支持 Unicode 的正则表达式引擎，可以匹配更广泛的字符范围。例如，`\p{L}`匹配任何语言中的字母字符。`\p{Nd}`匹配任何十进制数字（相当于`[0-9]`的 Unicode 版本）。
@@ -146,10 +147,6 @@ str_view(x, "\\bsum\\_\\b")
 
 ```
 
-
-
-
-
 ## 零宽匹配
 
 断言（assertions）是一种零宽度（zero-width）的匹配，它用于指定一个位置，这个位置应该满足某种条件，但不会消耗（匹配）任何字符。
@@ -167,6 +164,14 @@ str_replace_all(
 
 ### 断言assertions
 
+-   `(?=pattern)` 要求此位置的后面必须匹配表达式pattern
+
+-   `(?!pattern)` 要求此位置的后面不能匹配表达式pattern
+
+-   `(?<=pattern)` 要求此位置的前面必须匹配表达式pattern
+
+-   `(?<!pattern)` 要求此位置的前面不能匹配表达式pattern
+
 ```{r}
 x <- c("ocauuno", "0on0242uh","nauguio","nucu0ono0huhun")
 
@@ -190,6 +195,13 @@ str_view(x,pattern = "(?<!0o)n")  # 匹配前面不是0o的n
 
 反向引用（Backreference）允许引用之前在正则表达式中匹配到的子表达式。简单来说，就是可以在正则表达式的后续部分重复使用之前已经匹配的内容。它通过使用`\数字`的形式来实现，其中 “数字” 表示之前捕获组的编号。
 
+```{r}
+# 有哪些字母是重复两次的
+str_view(fruit, "(.)\\1", match = TRUE)
+
+str_view(fruit, "(..)\\1", match = TRUE)
+```
+
 捕获组是通过在正则表达式中使用括号()来定义的。例如，在正则表达式(\d{3})-(\d{4})中，(\d{3})是第一个捕获组，(\d{4})是第二个捕获组。捕获组的编号是从左到右，从 1 开始计数。当这个正则表达式匹配一个电话号码，如123-4567时，123被存储在第一个捕获组中，4567被存储在第二个捕获组中。
 
 ```{r}
@@ -199,7 +211,6 @@ str_replace(string = c("123 - 4567 - 0030 - 789",
          replacement = "backreference")
 ```
 
-
 有时候，我们可能不希望捕获组存储匹配内容，但又想使用分组来控制优先级或者方便使用反向引用。这时候可以使用非捕获组，其语法是(?:...)。例如，在正则表达式(?:\d{3})-(\d{4})中，(?:\d{3})是一个非捕获组，它匹配 3 位数字但不存储，只有(\d{4})是捕获组。如果要在后续部分引用\d{3}部分的内容，就不能使用反向引用，因为它没有被捕获。
 
 ```{r}
@@ -208,4 +219,3 @@ str_replace(string = c("123 - 4567 - 0030 - 789",
          pattern = "(\\d{3}) - (?:\\d{4}) - (?:\\d{5}) - \\1",
          replacement = "no backreference")
 ```
-
diff --git a/R_data_science/transform_tidy.qmd b/R_data_science/transform_tidy.qmd
@@ -1,16 +1,46 @@
-# 数据清洗
+# 数据整理
 
-```{r include=FALSE}
-if(!require(DT)) install.packages("DT")
+## 缺失值
+
+
+
+```{r}
+students <- read_csv("data/students.csv", na = c("N/A", ""))
+students
+```
+
+
+## 重命名
+
+```{r}
 if(!require(janitor)) install.packages("janitor")
+
+# snake_case
+students |>
+    janitor::clean_names(case="snake") #"title"  "lower_camel" "upper_camel"
+```
+
+## 重编码
+
+```{r}
+students$AGE[5] <- 5
+
+students
 ```
 
+
+
+
 ## 行操作**dplyr**
 
 <https://dplyr.tidyverse.org/index.html> "分割-应用-组合"（Split-Apply-Combine）
 
 ### `filter()`
 
+```{r include=FALSE}
+if(!require(DT)) install.packages("DT")
+```
+
 ```{r}
 dplyr::filter(mpg,model=="a4")
 
@@ -163,6 +193,10 @@ where(is.logical) selects all logical columns.
 
 ### `across(.cols, .fns, ...)`
 
+用在 mutate() 和summarise() 函数里面
+
+across() 对多列执行相同的函数操作，返回数据框
+
 ```{r}
 iris <- as_tibble(iris)
 iris %>%
@@ -432,6 +466,34 @@ who2 |>
   ) |> head()
 ```
 
+```{r}
+plant_record <- data.frame(
+         day = c(1L, 2L, 3L, 4L, 5L),
+    A_height = c(1.1, 1.2, 1.3, 1.4, 1.5),
+     A_width = c(2.1, 2.2, 2.3, 2.4, 2.5),
+     A_depth = c(3.1, 3.2, 3.3, 3.4, 3.5),
+    B_height = c(4.1, 4.2, 4.3, 4.4, 4.5),
+     B_width = c(5.1, 5.2, 5.3, 5.4, 5.5),
+     B_depth = c(6.1, 6.2, 6.3, 6.4, 6.5),
+    C_height = c(7.1, 7.2, 7.3, 7.4, 7.5),
+     C_width = c(8.1, 8.2, 8.3, 8.4, 8.5),
+     C_depth = c(9.1, 9.2, 9.3, 9.4, 9.5)
+)
+plant_record %>% 
+  knitr::kable()
+
+
+plant_record %>% 
+  tidyr::pivot_longer(
+    cols = !day,
+    names_to = c("species", "parameter"),
+    names_pattern = "(.*)_(.*)",
+    values_to = "value"
+)
+```
+
+
+
 #### 列名包含变量名和变量值
 
 ```{r}
@@ -442,7 +504,7 @@ household
 household |> 
   pivot_longer(
     cols = !family, 
-    names_to = c(".value", "child"), # 使用透视列名称的第一个组件作为变量名称
+    names_to = c(".value", "child"), # 使用列名的第一个组件作为变量名称
     names_sep = "_", 
     values_drop_na = TRUE
   )
@@ -580,7 +642,7 @@ tibble(
           regex = "(.*)\\((.*)\\)")
 ```
 
-### unite()
+### 合并列 unite()
 
 ```{r}
 # 合并列
@@ -589,23 +651,4 @@ table5 %>%
   unite(year_Y,century,year,sep='')
 ```
 
-## 重编码
-
-```{r}
-students <- read_csv("data/students.csv", na = c("N/A", ""))
-students
-```
-
-#### 变量名
-
-```{r}
-if(!require(janitor)) install.packages("janitor")
-
-# snake_case
-students |>
-    janitor::clean_names(case="snake") #"title"  "lower_camel" "upper_camel"
-```
-
-#### 变量值
 
-##### 缺失值