
Commit a7e9bc3

NRauschmayr authored and ddavydenko committed
smdebug custom analysis: bert example (#998)
Adding an example that demonstrates how to monitor attention scores in BERT model training with SageMaker Debugger. The example uses the GluonNLP tutorial on fine-tuning BERT for Question Answering. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
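
The training script in this commit saves the attention outputs of each multi-head attention cell through an smdebug hook, and the accompanying notebook loads and visualizes them. A minimal sketch of how such tensors could be read back with smdebug's trial API; the output path and the tensor-name regex below are assumptions, not taken from this commit:

# sketch only: read back attention tensors saved by the Debugger hook
from smdebug import modes
from smdebug.trials import create_trial

# hypothetical location where the Debugger hook wrote its output
trial = create_trial("s3://my-bucket/smdebug-output")

# attention outputs of the multi-head attention cells, saved during EVAL steps
names = trial.tensor_names(regex=".*multiheadattentioncell.*output_1")
for step in trial.steps(mode=modes.EVAL):
    scores = trial.tensor(names[0]).value(step, mode=modes.EVAL)
    print(step, scores.shape)  # e.g. (batch, heads, seq_len, seq_len)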
1 parent 0917140 commit a7e9bc3

File tree

8 files changed, +1645 -0 lines changed


sagemaker-debugger/model_specific_realtime_analysis/bert_attention_head_view/bert_attention_head_view.ipynb

Lines changed: 582 additions & 0 deletions
Large diffs are not rendered by default.

sagemaker-debugger/model_specific_realtime_analysis/bert_attention_head_view/entry_point/data.py

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.
sagemaker-debugger/model_specific_realtime_analysis/bert_attention_head_view/entry_point/model.py

Lines changed: 115 additions & 0 deletions
# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""BertForQA models."""

__all__ = ['BertForQA', 'BertForQALoss']

from mxnet.gluon import HybridBlock, loss, nn
from mxnet.gluon.loss import Loss


class BertForQA(HybridBlock):
    """Model for SQuAD task with BERT.

    The model feeds token ids and token type ids into BERT to get the
    pooled BERT sequence representation, then applies a Dense layer for the QA task.

    Parameters
    ----------
    bert: BERTModel
        Bidirectional encoder with transformer.
    prefix : str or None
        See document of `mx.gluon.Block`.
    params : ParameterDict or None
        See document of `mx.gluon.Block`.
    """

    def __init__(self, bert, prefix=None, params=None):
        super(BertForQA, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.span_classifier = nn.Dense(units=2, flatten=False)

    def __call__(self, inputs, token_types, valid_length=None):
        # pylint: disable=arguments-differ, dangerous-default-value
        """Generate the unnormalized score for the given input sequences."""
        # XXX Temporary hack for hybridization as HybridBlock does not support None inputs
        valid_length = [] if valid_length is None else valid_length
        return super(BertForQA, self).__call__(inputs, token_types, valid_length)

    def hybrid_forward(self, F, inputs, token_types, valid_length=None):
        # pylint: disable=arguments-differ
        """Generate the unnormalized score for the given input sequences.

        Parameters
        ----------
        inputs : NDArray, shape (batch_size, seq_length)
            Input words for the sequences.
        token_types : NDArray, shape (batch_size, seq_length)
            Token types for the sequences, used to indicate whether the word belongs to the
            first sentence or the second one.
        valid_length : NDArray or None, shape (batch_size,)
            Valid length of the sequence. This is used to mask the padded tokens.

        Returns
        -------
        outputs : NDArray
            Shape (batch_size, seq_length, 2)
        """
        # XXX Temporary hack for hybridization as HybridBlock does not support None inputs
        if isinstance(valid_length, list) and len(valid_length) == 0:
            valid_length = None
        bert_output = self.bert(inputs, token_types, valid_length)[0]

        output = self.span_classifier(bert_output)
        return output


class BertForQALoss(Loss):
    """Loss for SQuAD task with BERT."""

    def __init__(self, weight=None, batch_axis=0, **kwargs):  # pylint: disable=unused-argument
        super(BertForQALoss, self).__init__(
            weight=None, batch_axis=0, **kwargs)
        self.loss = loss.SoftmaxCELoss()

    def hybrid_forward(self, F, pred, label):  # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        pred : NDArray, shape (batch_size, seq_length, 2)
            BERTSquad forward output.
        label : list, length is 2, each shape is (batch_size, 1)
            label[0] is the starting position of the answer,
            label[1] is the ending position of the answer.

        Returns
        -------
        outputs : NDArray
            Shape (batch_size,)
        """
        # split the (batch, seq_len, 2) predictions into start and end scores
        pred = F.split(pred, axis=2, num_outputs=2)
        start_pred = pred[0].reshape((0, -3))
        start_label = label[0]
        end_pred = pred[1].reshape((0, -3))
        end_label = label[1]
        # average the softmax cross-entropy loss over the start and end positions
        return (self.loss(start_pred, start_label) + self.loss(
            end_pred, end_label)) / 2
Lines changed: 1 addition & 0 deletions

gluonnlp
Lines changed: 180 additions & 0 deletions
import argparse
import time
import numpy as np
import mxnet as mx

import gluonnlp as nlp
from gluonnlp.data import SQuAD
from model import BertForQALoss, BertForQA
from data import SQuADTransform, preprocess_dataset

import smdebug.mxnet as smd
from smdebug import modes


def get_dataloaders(batch_size, vocab, train_dataset_size, val_dataset_size):

    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Stack(),
        nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
        nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
        nlp.data.batchify.Stack('float32'),
        nlp.data.batchify.Stack('float32'),
        nlp.data.batchify.Stack(),
    )

    train_data = SQuAD("train", version='2.0')[:train_dataset_size]

    train_data_transform, _ = preprocess_dataset(
        train_data, SQuADTransform(
            nlp.data.BERTTokenizer(vocab=vocab, lower=True),
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_pad=True,
            is_training=True))

    train_dataloader = mx.gluon.data.DataLoader(
        train_data_transform, batchify_fn=batchify_fn,
        batch_size=batch_size, num_workers=4, shuffle=True)

    # use only the first val_dataset_size validation samples
    dev_data = SQuAD("dev", version='2.0')[:val_dataset_size]
    dev_data = mx.gluon.data.SimpleDataset(dev_data)

    dev_dataset = dev_data.transform(
        SQuADTransform(
            nlp.data.BERTTokenizer(vocab=vocab, lower=True),
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_pad=False,
            is_training=False)._transform, lazy=False)

    dev_data_transform, _ = preprocess_dataset(
        dev_data, SQuADTransform(
            nlp.data.BERTTokenizer(vocab=vocab, lower=True),
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_pad=False,
            is_training=False))

    dev_dataloader = mx.gluon.data.DataLoader(
        dev_data_transform,
        batchify_fn=batchify_fn,
        num_workers=1, batch_size=batch_size,
        shuffle=False, last_batch='keep')

    return train_dataloader, dev_dataloader, dev_dataset


def train_model(epochs, batch_size, learning_rate, train_dataset_size, val_dataset_size):

    # use a GPU context for training
    ctx = mx.gpu()

    # load pretrained BERT model weights (trained on the BookCorpus and English Wikipedia datasets)
    bert, vocab = nlp.model.get_model(
        name='bert_12_768_12',
        dataset_name='book_corpus_wiki_en_uncased',
        vocab=None,
        pretrained='true',
        ctx=ctx,
        use_pooler=False,
        use_decoder=False,
        use_classifier=False,
        output_attention=True)

    # create BERT model for Question Answering
    net = BertForQA(bert=bert)
    net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)

    # create smdebug hook from the JSON config provided by SageMaker and register the model
    hook = smd.Hook.create_from_json_file()
    hook.register_block(net)

    # loss function for BERT model training
    loss_function = BertForQALoss()

    # trainer
    trainer = mx.gluon.Trainer(net.collect_params(),
                               'bertadam',
                               {'learning_rate': learning_rate},
                               update_on_kvstore=False)

    # create dataloaders
    train_dataloader, dev_dataloader, dev_dataset = get_dataloaders(
        batch_size, vocab, train_dataset_size, val_dataset_size)

    # do not apply weight decay to LayerNorm and bias parameters
    for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0

    params = [p for p in net.collect_params().values()
              if p.grad_req != 'null']

    # start training loop
    for epoch_id in range(epochs):

        for batch_id, data in enumerate(train_dataloader):
            hook.set_mode(modes.TRAIN)
            with mx.autograd.record():
                _, inputs, token_types, valid_length, start_label, end_label = data

                # forward pass
                out = net(inputs.astype('float32').as_in_context(ctx),
                          token_types.astype('float32').as_in_context(ctx),
                          valid_length.astype('float32').as_in_context(ctx))

                # compute loss
                ls = loss_function(out, [
                    start_label.astype('float32').as_in_context(ctx),
                    end_label.astype('float32').as_in_context(ctx)]).mean()

            # backpropagation
            ls.backward()
            nlp.utils.clip_grad_global_norm(params, 1)

            # update model parameters
            trainer.update(1)

        # validation loop
        hook.set_mode(modes.EVAL)
        for data in dev_dataloader:

            example_ids, inputs, token_types, valid_length, _, _ = data

            # forward pass
            out = net(inputs.astype('float32').as_in_context(ctx),
                      token_types.astype('float32').as_in_context(ctx),
                      valid_length.astype('float32').as_in_context(ctx))

            # record input tokens
            input_tokens = np.array([])
            for example_id in example_ids.asnumpy().tolist():
                array = np.array(dev_dataset[example_id][0].tokens, dtype=np.str)
                array = array.reshape(1, array.shape[0])
                input_tokens = np.append(input_tokens, array)

            if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
                hook._write_raw_tensor_simple("input_tokens", input_tokens)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--val_dataset_size', type=int, default=64)
    parser.add_argument('--train_dataset_size', type=int, default=1024)
    parser.add_argument('--smdebug_dir', type=str, default=None)

    # parse arguments
    args, _ = parser.parse_known_args()

    # train model
    model = train_model(epochs=args.epochs, batch_size=args.batch_size,
                        learning_rate=args.learning_rate,
                        train_dataset_size=args.train_dataset_size,
                        val_dataset_size=args.val_dataset_size)
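
A sketch, not part of this commit, of how this training script could be launched as a SageMaker training job with a Debugger hook configuration; the bucket, role, entry-point name, framework version, and tensor regex below are assumptions. smd.Hook.create_from_json_file() in the script picks up the configuration that the estimator generates at runtime.

# sketch only: launch the entry point with a Debugger hook configuration
import sagemaker
from sagemaker.mxnet import MXNet
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig

hook_config = DebuggerHookConfig(
    s3_output_path="s3://my-bucket/smdebug-output",   # hypothetical output location
    collection_configs=[
        CollectionConfig(
            name="all",
            parameters={
                # save the attention outputs and the recorded input tokens
                "include_regex": ".*multiheadattentioncell.*output_1|.*input_tokens",
                "eval.save_interval": "1",
            })
    ])

estimator = MXNet(
    entry_point="train.py",            # assumed script name
    source_dir="entry_point",
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    framework_version="1.6.0",         # assumed framework/Python versions
    py_version="py3",
    hyperparameters={"epochs": 2, "batch_size": 16},
    debugger_hook_config=hook_config)

estimator.fit(wait=False)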
Lines changed: 92 additions & 0 deletions

from bokeh.plotting import show, figure
from bokeh.models.annotations import Title
from bokeh.models import ColumnDataSource, Label, Range1d
from bokeh.io import show, output_notebook, push_notebook
from bokeh.models.glyphs import Line
import numpy as np

output_notebook()


class AttentionHeadView():
    def __init__(self,
                 input_tokens=None,
                 tensors=None,
                 layer='bertencoder0_transformer0_multiheadattentioncell0_output_1',
                 step=0,
                 n_tokens=20):
        self.head = 0
        self.step = step
        self.input_tokens = input_tokens[:n_tokens]
        self.n_tokens = n_tokens
        self.tensors = tensors
        self.p = None
        self.layer = layer
        self.sources = []
        self.create()

    def update(self):

        tensor = self.tensors[self.layer][self.step][0, self.head, :, :]

        counter = 0
        for x in range(self.n_tokens):
            for y in range(self.n_tokens):
                source = self.sources[counter]
                source.line_width = tensor[x, y] * 2
                counter += 1

    def select_layer(self, layer):
        self.layer = layer
        self.update()
        push_notebook()

    def select_head(self, head):
        self.head = head
        self.update()
        push_notebook()

    def select_step(self, step):
        self.step = step
        self.update()
        push_notebook()

    def create(self):

        # set size of figure
        self.p = figure(width=450,
                        plot_height=50 * self.n_tokens,
                        x_range=Range1d(0, self.n_tokens + 2),
                        y_range=Range1d(0, self.n_tokens))

        self.p.xgrid.visible = False
        self.p.ygrid.visible = False
        self.p.axis.visible = False

        x = np.zeros(self.n_tokens) + 2
        y = np.flip(np.arange(0, self.n_tokens), axis=0)

        # set input tokens in plot
        for token, x_i, y_i in zip(self.input_tokens, x, y):
            text1 = Label(x=x_i - 1,
                          y=y_i,
                          text=token,
                          text_font_size='10pt')
            text2 = Label(x=x_i + 10,
                          y=y_i,
                          text=token,
                          text_font_size='10pt')
            self.p.add_layout(text2)
            self.p.add_layout(text1)

        tensor = self.tensors[self.layer][self.step][0, self.head, :, :]

        # plot attention weights
        for x in range(self.n_tokens):
            for y in range(self.n_tokens):
                source = ColumnDataSource(data=dict(x=[2, 12],
                                                    y=[self.n_tokens - x - 1, self.n_tokens - y - 1]))
                line = Line(x="x", y="y", line_width=tensor[x, y], line_color="blue")
                self.p.add_glyph(source, line)
                self.sources.append(line)

        show(self.p, notebook_handle=True)
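
The class above expects a dictionary of attention tensors indexed by layer name and step. A hedged usage sketch of how the notebook might build that dictionary from the Debugger output and drive the view; the trial path, the dictionary layout, and reading the raw input_tokens tensor back through the trial API are assumptions:

# sketch only: wire smdebug output into AttentionHeadView
from smdebug import modes
from smdebug.trials import create_trial

trial = create_trial("s3://my-bucket/smdebug-output")  # hypothetical output location
layer = "bertencoder0_transformer0_multiheadattentioncell0_output_1"
steps = trial.steps(mode=modes.EVAL)

# tensors[layer][step] is expected to have shape (batch, heads, seq_len, seq_len)
tensors = {layer: {s: trial.tensor(layer).value(s, mode=modes.EVAL) for s in steps}}
# assumes the raw "input_tokens" tensor written by the training script can be read back
tokens = trial.tensor("input_tokens").value(steps[0], mode=modes.EVAL)

view = AttentionHeadView(input_tokens=tokens, tensors=tensors, layer=layer,
                         step=steps[0], n_tokens=20)
view.select_head(3)  # switch to another attention head and redraw the plot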

0 commit comments
