dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels)
```

## Model Architecture

### DataLoader

```python
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler

train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate_fn)
```

The `collate_fn` callback set on the DataLoader preprocesses each batch before it is returned:

```python
def collate_fn(batch):
    all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels = map(torch.stack, zip(*batch))
    max_len = max(all_lens).item()  # the actual maximum sequence length within the current batch
    all_input_ids = all_input_ids[:, :max_len]  # truncate to the longest sequence in this batch: max_length --> max_len
    all_attention_mask = all_attention_mask[:, :max_len]
    all_token_type_ids = all_token_type_ids[:, :max_len]
    return all_input_ids, all_attention_mask, all_token_type_ids, all_labels
```
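
To see the dynamic truncation at work, here is a minimal sketch that feeds `collate_fn` a hand-built batch; the token ids, lengths, and padding width are invented for illustration:

```python
import torch

# Two samples padded to max_length = 8; their real lengths are 4 and 3.
batch = [
    (torch.tensor([101, 7592, 2088, 102, 0, 0, 0, 0]),  # input_ids
     torch.tensor([1, 1, 1, 1, 0, 0, 0, 0]),            # attention_mask
     torch.tensor([0, 0, 0, 0, 0, 0, 0, 0]),            # token_type_ids
     torch.tensor(4),                                    # real length
     torch.tensor(1)),                                   # label
    (torch.tensor([101, 2026, 102, 0, 0, 0, 0, 0]),
     torch.tensor([1, 1, 1, 0, 0, 0, 0, 0]),
     torch.tensor([0, 0, 0, 0, 0, 0, 0, 0]),
     torch.tensor(3),
     torch.tensor(0)),
]

input_ids, attention_mask, token_type_ids, labels = collate_fn(batch)
print(input_ids.shape)  # torch.Size([2, 4]): truncated from 8 to the batch max of 4
```

Because each batch is truncated to its own longest sequence rather than the global `max_length`, batches of short sequences waste no computation on padding positions.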

### BertEmbeddings

![input embeddings = token embeddings + segmentation embeddings + position embeddings](图解BERT/7.png)

```python
class BertEmbeddings(nn.Module):
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        seq_length = input_ids.size(1)
        if position_ids is None:
            # generate a position sequence (0, 1, 2, ...) for every sample in the
            # batch, forming a matrix of position ids
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)  # the position encoding is a learnable matrix
        token_type_embeddings = self.token_type_embeddings(token_type_ids)  # lets the model learn to distinguish the sentences on its own

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
```

![How the embedding vectors are produced](图解BERT/8.png)
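
As a quick sanity check on shapes, the sketch below instantiates `BertEmbeddings` with a hypothetical toy config; `BertLayerNorm` is assumed to behave like `torch.nn.LayerNorm` (the original code uses an equivalent implementation):

```python
import torch
import torch.nn as nn
from types import SimpleNamespace

BertLayerNorm = nn.LayerNorm  # assumption: stands in for the library's BertLayerNorm

# toy config; real BERT-base uses vocab_size=30522, hidden_size=768, etc.
config = SimpleNamespace(vocab_size=100, hidden_size=16, max_position_embeddings=32,
                         type_vocab_size=2, layer_norm_eps=1e-12, hidden_dropout_prob=0.1)

embeddings = BertEmbeddings(config)
input_ids = torch.randint(1, 100, (2, 10))  # batch of 2 sequences, 10 tokens each
out = embeddings(input_ids)                 # token + position + segment, then LayerNorm + dropout
print(out.shape)                            # torch.Size([2, 10, 16])
```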

### BertEncoder

#### BertLayer

![BertLayer architecture](图解BERT/9.png)

```python
class BertIntermediate(nn.Module):
    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)  # (768, 3072)
        # activation function: GELU
        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)  # activation function: GELU
        return hidden_states

class BertOutput(nn.Module):
    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)  # (3072, 768)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)  # residual connection, then LayerNorm
        return hidden_states

class BertLayer(nn.Module):
    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask=None):
        attention_outputs = self.attention(hidden_states, attention_mask)
        attention_output = attention_outputs[0]
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
```
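
Taken together, BertIntermediate and BertOutput form the position-wise feed-forward sublayer: expand 768 → 3072, apply GELU, project 3072 → 768, then add the residual and apply LayerNorm (the post-LN arrangement). A standalone sketch with plain `nn` modules, dropout omitted for brevity:

```python
import torch
import torch.nn as nn

hidden_size, intermediate_size = 768, 3072             # BERT-base sizes
dense_in = nn.Linear(hidden_size, intermediate_size)   # plays the role of BertIntermediate.dense
act = nn.GELU()                                        # plays the role of ACT2FN["gelu"]
dense_out = nn.Linear(intermediate_size, hidden_size)  # plays the role of BertOutput.dense
norm = nn.LayerNorm(hidden_size, eps=1e-12)

x = torch.randn(2, 10, hidden_size)  # attention_output: (batch, seq_len, hidden)
h = dense_out(act(dense_in(x)))      # expand -> GELU -> project back
y = norm(h + x)                      # residual connection, then LayerNorm (post-LN)
print(y.shape)                       # torch.Size([2, 10, 768])
```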

#### BertEncoder

![BertEncoder architecture](图解BERT/11.png)

```python
class BertEncoder(nn.Module):
    def __init__(self, config):
        super(BertEncoder, self).__init__()
        # a stack of num_hidden_layers identical BertLayers (12 for BERT-base)
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask=None):
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, attention_mask)
        return hidden_states
```

### BertPooler

![BertPooler architecture](图解BERT/10.png)

```python
class BertPooler(nn.Module):
    def __init__(self, config):
        super(BertPooler, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]  # the [CLS] token's contextual embedding
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
```
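
The slicing is worth a second look: `hidden_states[:, 0]` keeps the batch dimension and selects position 0, the [CLS] token, from every sequence. A toy illustration with invented sizes:

```python
import torch

hidden_states = torch.randn(2, 10, 768)   # (batch, seq_len, hidden_size)
first_token_tensor = hidden_states[:, 0]  # one [CLS] vector per sequence
print(first_token_tensor.shape)           # torch.Size([2, 768])
```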

### BertModel

![BertModel architecture](图解BERT/12.png)

```python
class BertModel(BertPreTrainedModel):
    def __init__(self, config):
        super(BertModel, self).__init__(config)
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        # broadcast the (batch, seq_len) padding mask to (batch, 1, 1, seq_len)
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        # padded positions become -10000.0 so that softmax gives them ~zero attention
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
        sequence_output = self.encoder(embedding_output,
                                       extended_attention_mask,  # padding mask
                                       )
        pooled_output = self.pooler(sequence_output)

        outputs = (sequence_output, pooled_output,)
        return outputs
```
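
The mask arithmetic in `forward` is easiest to see with concrete numbers; the mask values below are invented for illustration:

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])    # 1 = real token, 0 = padding
extended = attention_mask.unsqueeze(1).unsqueeze(2)  # (1, 1, 1, 5): broadcasts over heads and query positions
extended = (1.0 - extended.float()) * -10000.0
print(extended)
# tensor([[[[    -0.,     -0.,     -0., -10000., -10000.]]]])
# Added to the raw attention scores before softmax, the -10000 entries drive
# the attention weights on padded positions to effectively zero.
```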

### BertForSequenceClassification

![BertForSequenceClassification architecture](图解BERT/13.png)

```python
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, labels=None):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,  # padding mask
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        pooled_output = outputs[1]  # for classification, only the pooled [CLS] representation is needed

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)
```
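
For completeness, a minimal sketch of one fine-tuning pass, assuming the `model` and `train_dataloader` built above; the choice of `torch.optim.AdamW` and the learning rate are illustrative, not taken from the original training script:

```python
from torch.optim import AdamW  # assumption: the original repo may ship its own optimizer

model.train()
optimizer = AdamW(model.parameters(), lr=2e-5)

for batch in train_dataloader:
    # collate_fn returns exactly these four tensors
    input_ids, attention_mask, token_type_ids, labels = batch
    outputs = model(input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    labels=labels)
    loss = outputs[0]  # the loss comes first when labels are provided
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```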