```
> Words that are not in the vocabulary are represented by `[UNK]`, whose id is 100.
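
A quick way to check this id (a minimal sketch; `bert-base-chinese` is an assumed checkpoint, not taken from this repo):

```python
# Sketch: looking up the out-of-vocabulary token id directly.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")  # assumed vocab
print(tokenizer.unk_token)                       # [UNK]
print(tokenizer.convert_tokens_to_ids("[UNK]"))  # 100
```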

3. Truncation strategy for over-long sequences
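
The trimming itself happens inside `PreTrainedTokenizer.truncate_sequences`, whose default strategy is `longest_first`: tokens are removed one at a time from whichever sequence is currently longer, so a sentence pair shrinks evenly until it fits. A minimal standalone sketch of that idea (the function below is hypothetical, not the library's implementation):

```python
# Hypothetical sketch of the "longest_first" truncation strategy.
def truncate_longest_first(ids_a, ids_b, num_tokens_to_remove):
    for _ in range(num_tokens_to_remove):
        if ids_b is None or len(ids_a) > len(ids_b):
            ids_a = ids_a[:-1]  # drop from the (longer) first sequence
        else:
            ids_b = ids_b[:-1]  # drop from the second sequence
    return ids_a, ids_b
```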

4. Add the special tokens

![Adding special tokens to the original sequence](图解BERT/1.png)

```json
[101, 5500, 4873, 704, 4638, 4960, 4788, 2501, 2578, 102]
```

> Special token ids in BertTokenizer:
> - `[CLS]`: 101
> - `[SEP]`: 102
> - `[MASK]`: 103
> - `[UNK]`: 100
> - `[PAD]`: 0
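
These ids need not be hard-coded; the tokenizer exposes them as attributes (sketch, same assumed `bert-base-chinese` vocab as above):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")  # assumed vocab
print(tokenizer.cls_token_id)   # 101
print(tokenizer.sep_token_id)   # 102
print(tokenizer.mask_token_id)  # 103
print(tokenizer.unk_token_id)   # 100
print(tokenizer.pad_token_id)   # 0
```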

```python
# BertTokenizer
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    # single sequence: [CLS] X [SEP]
    if token_ids_1 is None:
        return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
    # pair of sequences: [CLS] A [SEP] B [SEP]
    cls = [self.cls_token_id]
    sep = [self.sep_token_id]
    return cls + token_ids_0 + sep + token_ids_1 + sep
```
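
Applied to the eight example ids above, the method reproduces the sequence just shown (sketch; the tokenizer construction is an assumption):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")   # assumed vocab
token_ids = [5500, 4873, 704, 4638, 4960, 4788, 2501, 2578]      # example ids from above
print(tokenizer.build_inputs_with_special_tokens(token_ids))
# [101, 5500, 4873, 704, 4638, 4960, 4788, 2501, 2578, 102]
```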

5. Create the sentence-id list (`token_type_ids`) to tell different sentences apart

![How token_type_ids works](图解BERT/2.png)

```json
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
```

```python
# BertTokenizer
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
    """
    Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]
    # single sequence: all 0s; pair: 0s for "[CLS] A [SEP]", 1s for "B [SEP]"
    if token_ids_1 is None:
        return len(cls + token_ids_0 + sep) * [0]
    return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
```
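
For a single sentence this reproduces the all-zero list above; with a (hypothetical) second sentence the `1` segment appears (sketch):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")  # assumed vocab
ids_a = [5500, 4873, 704, 4638, 4960, 4788, 2501, 2578]         # example ids from above
ids_b = [1, 2, 3]                                               # hypothetical second sentence
print(tokenizer.create_token_type_ids_from_sequences(ids_a))
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
print(tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b))
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
```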

6. Create the mask list that marks the special-token positions

![How special_tokens_mask works](图解BERT/3.png)

```python
# BertTokenizer
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
    # 1 flags a special token ([CLS]/[SEP]), 0 flags an ordinary sequence token
    if token_ids_1 is not None:
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
    return [1] + ([0] * len(token_ids_0)) + [1]
```
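
On the running example the mask flags exactly the `[CLS]` and `[SEP]` positions (sketch):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")  # assumed vocab
ids_a = [5500, 4873, 704, 4638, 4960, 4788, 2501, 2578]         # example ids from above
print(tokenizer.get_special_tokens_mask(ids_a))
# [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]  -> 1 at the [CLS] and [SEP] positions
```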

7. Over-length truncation

```python
# PreTrainedTokenizer
if max_length and len(encoded_inputs["input_ids"]) > max_length:
    encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
    encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
```
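
Note that a plain slice taken after the special tokens were added can cut off the trailing `[SEP]`. A quick sketch with the running example:

```python
# Sketch: hard truncation of already-built inputs can drop the closing [SEP] (102).
input_ids = [101, 5500, 4873, 704, 4638, 4960, 4788, 2501, 2578, 102]
max_length = 6
print(input_ids[:max_length])
# [101, 5500, 4873, 704, 4638, 4960]  -> no trailing [SEP] left
```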