Commit

code_format for chaper 1-6
yaoqih committed Nov 19, 2024
1 parent 14e8f4e commit 9623b46
Showing 5 changed files with 94 additions and 30 deletions.
108 changes: 83 additions & 25 deletions chapters/zh-CN/chapter3/2.mdx
@@ -104,7 +104,7 @@ raw_datasets = load_dataset("glue", "mrpc")
raw_datasets
```

```python
```python out
DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
@@ -133,10 +133,12 @@ raw_train_dataset[0]
```

```python
{'idx': 0,
 'label': 1,
 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'}
{
    "idx": 0,
    "label": 1,
    "sentence1": 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
    "sentence2": 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
}
```

We can see that the labels are already integers, so no preprocessing is needed there. To know which integer corresponds to which label, we can inspect the `features` of our `raw_train_dataset`. This will tell us the type of each column:
@@ -146,10 +148,14 @@ raw_train_dataset.features
```

```python
{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
 'idx': Value(dtype='int32', id=None)}
{
    "sentence1": Value(dtype="string", id=None),
    "sentence2": Value(dtype="string", id=None),
    "label": ClassLabel(
        num_classes=2, names=["not_equivalent", "equivalent"], names_file=None, id=None
    ),
    "idx": Value(dtype="int32", id=None),
}
```

In the example above, `label` is a `ClassLabel` feature, which stores the labels as integers mapped to class names: `0` corresponds to `not_equivalent` and `1` corresponds to `equivalent`.
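A minimal sketch (not part of this commit) of how that mapping can be queried, assuming `raw_train_dataset` is loaded as in the tutorial code above:

```python
# Sketch: convert between integer labels and their string names.
# Assumes `raw_train_dataset` comes from the tutorial code above.
label_feature = raw_train_dataset.features["label"]

print(label_feature.int2str(0))  # not_equivalent
print(label_feature.int2str(1))  # equivalent
print(label_feature.str2int("equivalent"))  # 1
```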
@@ -187,10 +193,26 @@ inputs
```

```python
{
  'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
{
    "input_ids": [
        101,
        2023,
        2003,
        1996,
        2034,
        6251,
        1012,
        102,
        2023,
        2003,
        1996,
        2117,
        2028,
        1012,
        102,
    ],
    "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
    "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
}
```

@@ -211,14 +233,46 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])
This will give:

```python
['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']
[
    "[CLS]",
    "this",
    "is",
    "the",
    "first",
    "sentence",
    ".",
    "[SEP]",
    "this",
    "is",
    "the",
    "second",
    "one",
    ".",
    "[SEP]",
]
```

So we can see the model expects its inputs to be of the form `[CLS] sentence1 [SEP] sentence2 [SEP]`. When there are two sentences, the `token_type_ids` are:

```python
['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']
[ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
[
    "[CLS]",
    "this",
    "is",
    "the",
    "first",
    "sentence",
    ".",
    "[SEP]",
    "this",
    "is",
    "the",
    "second",
    "one",
    ".",
    "[SEP]",
]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
```

As you can see, the parts of the input corresponding to `[CLS] sentence1 [SEP]` all have a `token_type_ids` of `0`, while the other parts, corresponding to `sentence2 [SEP]`, all have a `token_type_ids` of `1`.
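A minimal sketch (not part of this commit) of how that alignment can be checked directly, assuming `tokenizer` and `inputs` are defined as in the tutorial code above:

```python
# Sketch: print each token next to its token type ID.
# Assumes `tokenizer` and `inputs` come from the tutorial code above.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
for token, type_id in zip(tokens, inputs["token_type_ids"]):
    print(f"{token:>10}  {type_id}")
```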
@@ -267,7 +321,7 @@ tokenized_datasets

The way the 🤗 Datasets library applies this processing is by adding new fields to the dataset, one for each key in the dictionary returned by the preprocessing function:

```python
```python out
DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
@@ -348,19 +402,23 @@ batch = data_collator(samples)
{#if fw === 'tf'}

```python
{'attention_mask': TensorShape([8, 67]),
 'input_ids': TensorShape([8, 67]),
 'token_type_ids': TensorShape([8, 67]),
 'labels': TensorShape([8])}
{
    "attention_mask": TensorShape([8, 67]),
    "input_ids": TensorShape([8, 67]),
    "token_type_ids": TensorShape([8, 67]),
    "labels": TensorShape([8]),
}
```

{:else}

```python
{'attention_mask': torch.Size([8, 67]),
 'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'labels': torch.Size([8])}
{
    "attention_mask": torch.Size([8, 67]),
    "input_ids": torch.Size([8, 67]),
    "token_type_ids": torch.Size([8, 67]),
    "labels": torch.Size([8]),
}
```

Looking good! Now that we've gone from raw text to batches our model can deal with, we're ready to fine-tune it.
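For readers skimming the diff, here is a minimal end-to-end sketch of the preprocessing shown in this chapter (not part of this commit; it assumes the `bert-base-uncased` checkpoint used in the course):

```python
# Sketch: tokenize the MRPC sentence pairs and build a dynamically padded batch.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Dynamic padding: each batch is padded only to the longest sample in that batch.
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})
```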
5 changes: 5 additions & 0 deletions chapters/zh-CN/chapter5/3.mdx
@@ -102,6 +102,7 @@ DatasetDict({
def lowercase_condition(example):
return {"condition": example["condition"].lower()}


drug_dataset.map(lowercase_condition)
```

@@ -281,8 +282,10 @@ new_drug_dataset = drug_dataset.map(

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)
```
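As context for the hunk above (a sketch, not part of this commit), `tokenize_function()` would typically be applied to the whole dataset with `Dataset.map()` in batched mode, assuming `drug_dataset` is loaded as earlier in the chapter:

```python
# Sketch: apply the fast tokenizer to every review in one batched pass.
# Assumes `drug_dataset` and `tokenize_function` come from the chapter's code above.
tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)
```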
@@ -315,9 +318,11 @@ def tokenize_function(examples):
```py
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def slow_tokenize_function(examples):
    return slow_tokenizer(examples["review"], truncation=True)


tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)
```

4 changes: 2 additions & 2 deletions chapters/zh-CN/chapter6/2.mdx
@@ -76,7 +76,7 @@ def handle_simple_responses(
```py
# Don't run the code below directly unless your dataset is small!
# training_corpus = [
# raw_datasets["train"][i: i + 1000]["whole_func_string"]
# raw_datasets["train"][i: i + 1000]["whole_func_string"]
# for i in range(0, len(raw_datasets["train"]), 1000)
# ]
```
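A memory-friendlier sketch (not part of this commit, but in the spirit of the warning above): build the training corpus as a generator so the whole dataset is never materialized at once, assuming `raw_datasets` is loaded as in the chapter:

```python
# Sketch: yield the corpus in chunks of 1,000 examples instead of building a list.
# Assumes `raw_datasets` comes from the chapter's code above.
def get_training_corpus():
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["whole_func_string"]


training_corpus = get_training_corpus()
```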
@@ -253,7 +253,7 @@ tokenizer.push_to_hub("code-search-net-tokenizer")
This creates a new repository named `code-search-net-tokenizer` in your account, containing the tokenizer files. You can then load the tokenizer from anywhere with its `from_pretrained()` method.

```py
# Replace "huggingface-course" below with your own username to load your tokenizer
# Replace "huggingface-course" below with your own username to load your tokenizer
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
```

4 changes: 2 additions & 2 deletions chapters/zh-CN/chapter6/3b.mdx
@@ -183,7 +183,7 @@ import torch
sequence_ids = inputs.sequence_ids()
# Mask everything apart from the tokens of the context
mask = [i != 1 for i in sequence_ids]
# Unmask the [CLS] token
# Unmask the [CLS] token
mask[0] = False
mask = torch.tensor(mask)[None]

@@ -199,7 +199,7 @@ import tensorflow as tf
sequence_ids = inputs.sequence_ids()
# Mask everything apart from the tokens of the context
mask = [i != 1 for i in sequence_ids]
# Unmask the [CLS] token
# Unmask the [CLS] token
mask[0] = False
mask = tf.constant(mask)[None]

3 changes: 2 additions & 1 deletion chapters/zh-CN/chapter6/7.mdx
@@ -318,7 +318,7 @@ def compute_scores(model):
    scores = {}
    model_loss = compute_loss(model)
    for token, score in model.items():
        # We always keep tokens of length 1
        # We always keep tokens of length 1
        if len(token) == 1:
            continue
        model_without_token = copy.deepcopy(model)
@@ -372,6 +372,7 @@ def tokenize(text, model):
    encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]
    return sum(encoded_words, [])


tokenize("This is the Hugging Face course.", model)
```

