This repository was archived by the owner on Apr 23, 2025. It is now read-only.

[WIP] Added support for BERT. #231

Merged
merged 29 commits into from
Feb 14, 2020
29 commits
0beb166
Added initial support for BERT.
eaplatanios Nov 26, 2019
e0bec4d
Renamed 'LayerNormalization' to 'LayerNorm'.
eaplatanios Nov 26, 2019
0c3e803
Added a 'TextModels' SwiftPM target.
eaplatanios Nov 26, 2019
3cbd9e7
Fixed some of the compilation errors.
eaplatanios Nov 26, 2019
cba579d
Added 'Optimizer' protocol.
eaplatanios Nov 26, 2019
ac9f7c8
Removed 'truncatedNormalInitializer'.
eaplatanios Nov 26, 2019
e36eeb5
Added initial support for BERT.
eaplatanios Nov 26, 2019
6103829
Renamed 'LayerNormalization' to 'LayerNorm'.
eaplatanios Nov 26, 2019
f817dac
Added a 'TextModels' SwiftPM target.
eaplatanios Nov 26, 2019
f8ea3d2
Fixed some of the compilation errors.
eaplatanios Nov 26, 2019
075ebe1
Added 'Optimizer' protocol.
eaplatanios Nov 26, 2019
e5af5cc
Removed 'truncatedNormalInitializer'.
eaplatanios Nov 26, 2019
9a0d6f5
Minor cleanup.
dan-zheng Jan 28, 2020
63219ee
Change `@differentiable` function default arguments from closures to …
dan-zheng Jan 29, 2020
ed7738f
Fix non-differentiability error using `withoutDerivative(at:)`.
dan-zheng Jan 29, 2020
509c89c
Add code for CoLA task.
dan-zheng Jan 29, 2020
e674ade
Add working main function.
dan-zheng Jan 30, 2020
6924f43
Tune learning rate schedule, add gradient clipping.
dan-zheng Jan 30, 2020
9a00c04
Made some minor edits to get the BERT classifier training to work for…
eaplatanios Jan 30, 2020
6761958
Rename "epoch" to "step" in training loop.
dan-zheng Jan 30, 2020
a54c9f7
Add CoLA evaluation.
dan-zheng Jan 30, 2020
5474bcf
Fix BERT training.
dan-zheng Jan 30, 2020
3a210ad
Make training loop an infinite loop.
dan-zheng Jan 31, 2020
c210167
Fixed BERT. (#294)
eaplatanios Jan 31, 2020
daa720c
Minor edits.
dan-zheng Jan 31, 2020
b7bcaa4
Temporarily disabled bucketing.
eaplatanios Jan 31, 2020
5b50b48
Merge branch 'master' of github.com:tensorflow/swift-models into bert
dan-zheng Feb 12, 2020
9a73fbe
Merge branch 'bert-wip' of github.com:tensorflow/swift-models into bert
dan-zheng Feb 13, 2020
ce25ee9
Delete extraneous file.
dan-zheng Feb 14, 2020
5 changes: 5 additions & 0 deletions Datasets/DatasetUtilities.swift
@@ -23,6 +23,7 @@ public enum DatasetUtilities {
public static let currentWorkingDirectoryURL = URL(
fileURLWithPath: FileManager.default.currentDirectoryPath)

@discardableResult
public static func downloadResource(
filename: String,
fileExtension: String,
@@ -49,6 +50,7 @@ public enum DatasetUtilities {
return localURL
}

@discardableResult
public static func fetchResource(
filename: String,
fileExtension: String,
@@ -121,6 +123,9 @@ public enum DatasetUtilities {
case "tar.gz", "tgz":
toolName = "tar"
arguments = ["xzf", archivePath, "-C", resource.localStorageDirectory.path]
case "zip":
toolName = "unzip"
arguments = [archivePath, "-d", resource.localStorageDirectory.path]
default:
printError("Unable to find archiver for extension \(resource.fileExtension).")
exit(-1)
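For illustration, a minimal sketch of how the updated utilities might be called once this change lands. Only `filename` and `fileExtension` are visible in this hunk; the `remoteRoot` and `localStorageDirectory` parameter names, the URL, and the directory layout below are assumptions for the example, not taken from this diff.

import Foundation

// Hypothetical download of a zip-packaged dataset. Because `downloadResource` is now
// marked `@discardableResult`, the returned local URL can be ignored when the caller
// only needs the side effect of fetching and extracting the archive; the new "zip"
// case extracts the archive with `unzip <archive> -d <directory>`.
let storageDirectory = DatasetUtilities.currentWorkingDirectoryURL
    .appendingPathComponent("CoLA", isDirectory: true)
DatasetUtilities.downloadResource(
    filename: "CoLA",
    fileExtension: "zip",
    remoteRoot: URL(string: "https://example.com/datasets")!,  // assumed parameter name and placeholder URL
    localStorageDirectory: storageDirectory)                   // assumed parameter name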
225 changes: 225 additions & 0 deletions Models/Text/Attention.swift
@@ -0,0 +1,225 @@
// Copyright 2019 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import TensorFlow

/// Input to an attention layer.
public struct AttentionInput<Scalar: TensorFlowFloatingPoint>: Differentiable {
/// Source tensor that we are attending from, with shape
/// `[batchSize, sourceSequenceLength, sourceDepth]` or
/// `[batchSize, sourceSequenceLength * sourceDepth]`.
public var source: Tensor<Scalar>

/// Target tensor that we are attending to, with shape
/// `[batchSize, targetSequenceLength, targetDepth]` or
/// `[batchSize, targetSequenceLength * targetDepth]`.
public var target: Tensor<Scalar>

/// Mask to apply on the attention scores. This is a tensor with shape
/// `[batchSize, sourceSequenceLength, targetSequenceLength]` or
/// `[batchSize, sourceSequenceLength * targetSequenceLength]`. The values should be `1` or `0`.
/// The attention scores will effectively be set to negative infinity for any positions in the
/// mask that are set to `0`, and will be unchanged for positions that are set to `1`.
public var mask: Tensor<Scalar>

/// The batch size of this input. This is optional because it is only needed if the input
/// sequences have been reshaped to matrices.
@noDerivative let batchSize: Int?

@differentiable
public init(
source: Tensor<Scalar>,
target: Tensor<Scalar>,
mask: Tensor<Scalar>,
batchSize: Int? = nil
) {
precondition(
source.rank == target.rank,
"The rank of the attention source and target tensors must match.")
self.source = source
self.target = target
self.mask = mask
self.batchSize = batchSize
}
}
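// Illustrative sketch (not part of this diff): constructing an `AttentionInput` for the
// self-attention case, where the source and target are the same tensor and an all-ones
// mask lets every position attend to every other position. The helper name and all sizes
// below are made up for the example.
func exampleSelfAttentionInput() -> AttentionInput<Float> {
    let batchSize = 2
    let sequenceLength = 8
    let hiddenSize = 16
    let hidden = Tensor<Float>(randomNormal: [batchSize, sequenceLength, hiddenSize])
    let mask = Tensor<Float>(ones: [batchSize, sequenceLength, sequenceLength])
    return AttentionInput(source: hidden, target: hidden, mask: mask)
}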

/// Multi-head attention layer.
///
/// This implementation is based on the
/// ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762) paper. If the source and target
/// tensors are the same, then this layer behaves as a self-attention layer. Each sequence step in
/// the source tensor attends to the corresponding sequence in the target tensor and returns a
/// fixed-size vector.
///
/// This function first projects the source tensor into a "query" tensor and the target tensor into
/// "key" and "value" tensors. These are (effectively) a list of tensors of length `headCount`,
/// where each tensor has shape `[batchSize, sequenceLength, headSize]`. It then performs a dot
/// product between the query and they key tensors and scales them. Finally, they are passed
/// through the softmax function to obtain attention probabilities. The value tensors are then
/// interpolated by these probabilities, and then concatenated back to a single result tensor.
///
/// In practice, the multi-head attention is implemented using transpose and reshape operations,
/// rather than using separate tensors.
///
/// - Source: ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
public struct MultiHeadAttention: Layer, Regularizable {
// TODO: Convert to a generic constraint once TF-427 is resolved.
public typealias Scalar = Float

@noDerivative public let sourceSize: Int
@noDerivative public let targetSize: Int
@noDerivative public let headCount: Int
@noDerivative public let headSize: Int
@noDerivative public let queryActivation: Activation<Scalar>
@noDerivative public let keyActivation: Activation<Scalar>
@noDerivative public let valueActivation: Activation<Scalar>
@noDerivative public let matrixResult: Bool

public var queryWeight: Tensor<Scalar>
public var queryBias: Tensor<Scalar>
public var keyWeight: Tensor<Scalar>
public var keyBias: Tensor<Scalar>
public var valueWeight: Tensor<Scalar>
public var valueBias: Tensor<Scalar>
@noDerivative public var attentionDropout: Dropout<Scalar>

public var regularizationValue: TangentVector {
TangentVector(
queryWeight: queryWeight,
queryBias: Tensor(Scalar(0)),
keyWeight: keyWeight,
keyBias: Tensor(Scalar(0)),
valueWeight: valueWeight,
valueBias: Tensor(Scalar(0)))
}

/// Creates a multi-head attention layer.
///
/// - Parameters:
/// - sourceSize: Size/depth of the source tensor this layer is attending from.
/// - targetSize: Size/depth of the target tensor this layer is attending to.
/// - headCount: Number of attention heads.
/// - headSize: Size/depth of each attention head.
/// - queryActivation: Activation function applied to the attention query tensor.
/// - keyActivation: Activation function applied to the attention key tensor.
/// - valueActivation: Activation function applied to the attention value tensor.
/// - attentionDropoutProbability: Dropout probability for the attention scores.
/// - matrixResult: If `true`, the resulting tensor will have shape
///     `[batchSize * sourceSequenceLength, headCount * headSize]`. Otherwise, it will have shape
///     `[batchSize, sourceSequenceLength, headCount * headSize]`.
/// - queryWeightInitializer: Initializer for the query transformation weight.
/// - queryBiasInitializer: Initializer for the query transformation bias.
/// - keyWeightInitializer: Initializer for the key transformation weight.
/// - keyBiasInitializer: Initializer for the key transformation bias.
/// - valueWeightInitializer: Initializer for the value transformation weight.
/// - valueBiasInitializer: Initializer for the value transformation bias.
public init(
sourceSize: Int,
targetSize: Int,
headCount: Int = 1,
headSize: Int = 512,
queryActivation: @escaping Activation<Scalar> = identity,
keyActivation: @escaping Activation<Scalar> = identity,
valueActivation: @escaping Activation<Scalar> = identity,
attentionDropoutProbability: Scalar = 0,
matrixResult: Bool = false,
queryWeightInitializer: ParameterInitializer<Scalar> = defaultWeightInitializer,
queryBiasInitializer: ParameterInitializer<Scalar> = defaultBiasInitializer,
keyWeightInitializer: ParameterInitializer<Scalar> = defaultWeightInitializer,
keyBiasInitializer: ParameterInitializer<Scalar> = defaultBiasInitializer,
valueWeightInitializer: ParameterInitializer<Scalar> = defaultWeightInitializer,
valueBiasInitializer: ParameterInitializer<Scalar> = defaultBiasInitializer
) {
self.sourceSize = sourceSize
self.targetSize = targetSize
self.headCount = headCount
self.headSize = headSize
self.queryActivation = queryActivation
self.keyActivation = keyActivation
self.valueActivation = valueActivation
self.matrixResult = matrixResult
self.queryWeight = queryWeightInitializer([sourceSize, headCount * headSize])
self.queryBias = queryBiasInitializer([headCount * headSize])
self.keyWeight = keyWeightInitializer([targetSize, headCount * headSize])
self.keyBias = keyBiasInitializer([headCount * headSize])
self.valueWeight = valueWeightInitializer([targetSize, headCount * headSize])
self.valueBias = valueBiasInitializer([headCount * headSize])
// TODO: Make dropout generic over the probability type.
self.attentionDropout = Dropout(probability: Double(attentionDropoutProbability))
}

@differentiable
public func callAsFunction(_ input: AttentionInput<Scalar>) -> Tensor<Scalar> {
precondition(
input.source.rank == 3 || input.batchSize != nil,
"Whenever the input is provided in matrix form, the batch size must also be provided.")
// Scalar dimensions referenced here:
// - B = batch size (number of sequences)
// - F = `input.source` sequence length
// - T = `input.target` sequence length
// - N = number of attention heads
// - H = size per attention head
let matrixInput = input.source.rank < 3
let B = matrixInput ? input.batchSize! : input.source.shape[0]
let F = matrixInput ? input.source.shape[0] / B : input.source.shape[1]
let T = matrixInput ? input.target.shape[0] / B : input.target.shape[1]
let N = headCount
let H = headSize

let source = input.source.reshapedToMatrix()
let target = input.target.reshapedToMatrix()

var q = queryActivation(matmul(source, queryWeight) + queryBias) // [B * F, N * H]
var k = keyActivation(matmul(target, keyWeight) + keyBias) // [B * T, N * H]
var v = valueActivation(matmul(target, valueWeight) + valueBias) // [B * T, N * H]

q = q.reshaped(to: [B, F, N, H]).transposed(permutation: 0, 2, 1, 3) // [B, N, F, H]
k = k.reshaped(to: [B, T, N, H]).transposed(permutation: 0, 2, 1, 3) // [B, N, T, H]
v = v.reshaped(to: [B, T, N, H]).transposed(permutation: 0, 2, 1, 3) // [B, N, T, H]

// Take the dot product between the query and the key to get the raw attention scores.
var attentionScores = matmul(q, transposed: false, k, transposed: true) // [B, N, F, T]
attentionScores = attentionScores / sqrt(Scalar(headSize))

// Since the attention mask is set to 1.0 for positions we want to attend to and 0.0 for
// masked positions, we create a tensor which is 0.0 for positions we want to attend to and
// -10000.0 for masked positions. Since we are adding this tensor to the raw scores before
// the softmax, this is effectively the same as removing the masked entries entirely.
let attentionMask = input.mask.expandingShape(at: 1) // [B, 1, F, T]
attentionScores = attentionScores - 10000 * (1 - attentionMask)

// Normalize the attention scores to convert them to probabilities. We are also dropping
// out entire tokens to attend to, which might seem a bit unusual, but it is taken from the
// original Transformer paper.
let attentionProbabilities = attentionDropout(softmax(attentionScores)) // [B, N, F, T]

let result = matmul(attentionProbabilities, v) // [B, N, F, H]
.transposed(permutation: 0, 2, 1, 3) // [B, F, N, H]
return matrixResult ?
result.reshaped(to: [B * F, N * H]) :
result.reshaped(to: [B, F, N * H])
}
}

extension MultiHeadAttention {
/// Default initializer to use for the linear transform weights.
public static var defaultWeightInitializer: ParameterInitializer<Scalar> {
truncatedNormalInitializer(standardDeviation: Tensor<Scalar>(0.02))
}

/// Default initializer to use for the linear transform biases.
public static var defaultBiasInitializer: ParameterInitializer<Scalar> {
zeros()
}
}
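
To make the intended call pattern concrete, here is a minimal usage sketch of the layer above. It is illustrative only: the sizes, head configuration, and dropout probability are made up for the example and are not the BERT configuration used elsewhere in this PR.

import TensorFlow

// Self-attention over a batch of sequences; all sizes are example values.
let batchSize = 2
let sequenceLength = 8
let hiddenSize = 64

let attention = MultiHeadAttention(
    sourceSize: hiddenSize,
    targetSize: hiddenSize,
    headCount: 4,
    headSize: 16,
    attentionDropoutProbability: 0.1)

let hidden = Tensor<Float>(randomNormal: [batchSize, sequenceLength, hiddenSize])
// An all-ones mask lets every position attend to every other position.
let mask = Tensor<Float>(ones: [batchSize, sequenceLength, sequenceLength])
let input = AttentionInput(source: hidden, target: hidden, mask: mask)

// With the default `matrixResult == false`, the output has shape
// [batchSize, sequenceLength, headCount * headSize] == [2, 8, 64].
let output = attention(input)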