tensorflow · pschuh · May 23, 2019 · May 23, 2019
diff --git a/Sources/DeepLearning/Core/Dataset.swift b/Sources/DeepLearning/Core/Dataset.swift
@@ -0,0 +1,216 @@
+//===-- Dataset.swift -----------------------------------------*- swift -*-===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+//
+// The dataset API.
+//
+//===----------------------------------------------------------------------===//
+
+/// The default graph-level seed, which `_tensorSeeds` returns as the first
+/// seed of every seed pair.
+/// - Note: See TensorFlow's `python.framework.random_seed.DEFAULT_GRAPH_SEED`.
+@usableFromInline let _defaultGraphSeed: Int64 = 87654321
+
+/// Derives the pair of seeds a random operation should use from an
+/// operation-specific seed.
+///
+/// Many random operations take two seeds internally so that the seed can be
+/// changed either globally for a graph or for one specific operation.
+///
+/// - Parameter seed: The operation-specific seed.
+/// - Returns: The graph-level seed followed by `seed`.
+/// - Note: See TensorFlow's `python.framework.random_seed.get_seed`.
+@usableFromInline @inline(__always)
+func _tensorSeeds(_ seed: Tensor<Int64>) -> (Tensor<Int64>, Tensor<Int64>) {
+  // TODO: TF's "global seed" is not supported yet, so the graph-level seed is
+  // always the default one. Investigate the best way to model it.
+  let graphSeed = Tensor<Int64>(_defaultGraphSeed)
+  return (graphSeed, seed)
+}
+
+//===----------------------------------------------------------------------===//
+// Single value dataset
+//===----------------------------------------------------------------------===//
+
+/// A potentially large set of elements, used to represent an input pipeline
+/// as a collection of element tensors.
+@_fixed_layout
+public struct Dataset<Element : TensorGroup> {
+  /// The underlying dataset handle.
+  public let _handle: VariantHandle
+
+  /// Creates a dataset that wraps the given dataset handle.
+  @inlinable
+  public init(_handle handle: VariantHandle) {
+    _handle = handle
+  }
+}
+
+public extension Dataset {
+  /// Creates a dataset backed by the experimental random-dataset op, using
+  /// `randomSeed` as the operation-level seed.
+  @inlinable
+  init(randomSeed: Int64) {
+    let seeds = _tensorSeeds(Tensor(randomSeed))
+    let handle = Raw.experimentalRandomDataset(seed: seeds.0, seed2: seeds.1,
+      outputTypes: Element._typeList, outputShapes: Element._unknownShapeList)
+    self.init(_handle: handle)
+  }
+}
+
+public extension Dataset {
+  /// Creates a dataset from `elements`, a batch of elements given as tensors.
+  @inlinable
+  init(elements: Element) {
+    let sliced = Raw.tensorSliceDataset(
+      components: [elements], outputShapes: Element._unknownShapeList)
+    self.init(_handle: sliced)
+  }
+}
+
+extension Dataset : Sequence {
+  public typealias Iterator = DatasetIterator<Element>
+
+  /// Returns a new iterator over the elements of this dataset.
+  @inlinable
+  public func makeIterator() -> DatasetIterator<Element> {
+    // Create an anonymous iterator resource, then bind it to this dataset.
+    let iteratorHandle = Raw.anonymousIterator(
+      outputTypes: Element._typeList, outputShapes: Element._unknownShapeList)
+    Raw.makeIterator(dataset: _handle, iterator: iteratorHandle)
+    return DatasetIterator(_handle: iteratorHandle)
+  }
+}
+
+public extension Dataset {
+  // Note that this Dataset API implementation uses an experimental tracing
+  // feature, which is not robust and does not have great diagnostics yet.
+
+  /// Returns a dataset containing the results of applying `transform` to
+  /// each element of this dataset.
+  @inlinable
+  func map<ResultElement : TensorGroup>(
+    _ transform: (Element) -> ResultElement
+  ) -> Dataset<ResultElement> {
+    let mapped = Raw.mapDataset(
+      inputDataset: _handle, otherArguments: Tensor<Int32>(0), f: transform,
+      outputTypes: ResultElement._typeList,
+      outputShapes: ResultElement._unknownShapeList,
+      useInterOpParallelism: true, preserveCardinality: false)
+    return Dataset<ResultElement>(_handle: mapped)
+  }
+
+  /// Returns a dataset containing the results of applying `transform` to
+  /// each element of this dataset, requesting `parallelCallCount` parallel
+  /// calls from the underlying op.
+  @inlinable
+  func map<ResultElement : TensorGroup>(
+    parallelCallCount: Int,
+    _ transform: (Element) -> ResultElement
+  ) -> Dataset<ResultElement> {
+    let mapped = Raw.parallelMapDataset(
+      inputDataset: _handle, otherArguments: Tensor<Int32>(0),
+      numParallelCalls: Tensor<Int32>(Int32(parallelCallCount)), f: transform,
+      outputTypes: ResultElement._typeList,
+      outputShapes: ResultElement._unknownShapeList,
+      useInterOpParallelism: true, sloppy: false, preserveCardinality: false)
+    return Dataset<ResultElement>(_handle: mapped)
+  }
+
+  /// Returns a dataset containing only the elements of this dataset for
+  /// which `isIncluded` evaluates to `true`.
+  @inlinable
+  func filter(_ isIncluded: (Element) -> Tensor<Bool>) -> Dataset {
+    let kept = Raw.filterDataset(
+      inputDataset: _handle, otherArguments: Tensor<Int32>(0),
+      predicate: isIncluded,
+      outputTypes: Element._typeList, outputShapes: Element._unknownShapeList)
+    return Dataset(_handle: kept)
+  }
+}
+
+public extension Dataset {
+  /// Returns this dataset shuffled through a buffer of `sampleCount`
+  /// elements, using `randomSeed` as the operation-level seed.
+  @inlinable
+  func shuffled(sampleCount: Int, randomSeed: Int64) -> Dataset {
+    let seeds = _tensorSeeds(Tensor(randomSeed))
+    let bufferSize = Tensor<Int64>(Int64(sampleCount))
+    let shuffledHandle = Raw.shuffleDataset(
+      inputDataset: _handle, bufferSize: bufferSize,
+      seed: seeds.0, seed2: seeds.1,
+      outputTypes: Element._typeList, outputShapes: Element._unknownShapeList)
+    return Dataset(_handle: shuffledHandle)
+  }
+
+  /// Returns a dataset that groups consecutive elements of this dataset into
+  /// batches of at most `batchSize` elements.
+  @inlinable
+  func batched(_ batchSize: Int) -> Dataset {
+    let batchedHandle = Raw.batchDataset(
+      inputDataset: _handle, batchSize: Tensor(Int64(batchSize)),
+      outputTypes: Element._typeList, outputShapes: Element._unknownShapeList)
+    return Dataset(_handle: batchedHandle)
+  }
+}
+
+/// The type that iterates over the elements of a dataset.
+@_fixed_layout
+public struct DatasetIterator<Element : TensorGroup> {
+  /// The underlying iterator resource handle.
+  @usableFromInline let _handle: ResourceHandle
+
+  @usableFromInline internal init(_handle handle: ResourceHandle) {
+    _handle = handle
+  }
+}
+
+extension DatasetIterator : IteratorProtocol {
+  /// Advances to the next element and returns it, or returns `nil` when no
+  /// next element exists.
+  @inlinable
+  public mutating func next() -> Element? {
+    // The next element arrives wrapped in an optional variant handle.
+    let fetched = Raw.iteratorGetNextAsOptional(
+      iterator: _handle, outputTypes: Element._typeList,
+      outputShapes: Element._unknownShapeList)
+    let hasValue = Raw.optionalHasValue(optional: fetched).scalarized()
+    if !hasValue {
+      return nil
+    }
+    return Raw.optionalGetValue(
+      optional: fetched, outputShapes: Element._unknownShapeList)
+  }
+}
+
+/// A pair of `TensorGroup` values that itself conforms to `TensorGroup`; the
+/// element type of the dataset returned by `zip(_:_:)`.
+@_fixed_layout
+public struct Zip2TensorGroup<T : TensorGroup, U : TensorGroup> : TensorGroup {
+  public var first: T
+  public var second: U
+
+  public init(_ firstValue: T, _ secondValue: U) {
+    first = firstValue
+    second = secondValue
+  }
+}
+
+/// Returns a dataset whose elements pair up the elements of two datasets.
+@inlinable
+public func zip<T : TensorGroup, U : TensorGroup>(
+  _ dataset1: Dataset<T>, _ dataset2: Dataset<U>
+) -> Dataset<Zip2TensorGroup<T, U>> {
+  let sources = [dataset1._handle, dataset2._handle]
+  return Dataset(_handle: Raw.zipDataset(inputDatasets: sources,
+    outputTypes: Zip2TensorGroup<T, U>._typeList,
+    outputShapes: Zip2TensorGroup<T, U>._unknownShapeList))
+}
diff --git a/Tests/DeepLearningTests/DatasetTests.swift b/Tests/DeepLearningTests/DatasetTests.swift
@@ -0,0 +1,156 @@
+// Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import XCTest
+import DeepLearning
+
+struct SimpleOutput : TensorGroup {
+    let a: TensorHandle<Int32>  // Checked against `elements1` in testMultiValue.
+    let b: TensorHandle<Int32>  // Checked against `elements2` in testMultiValue.
+}
+
+/// Tests for `Dataset` construction, iteration, and transformations.
+final class DatasetTests: XCTestCase {
+    func testMultiValue() {
+        let elements1: Tensor<Int32> = [0, 1, 2]
+        let elements2: Tensor<Int32> = [10, 11, 12]
+        let outputTypes = [Int32.tensorFlowDataType, Int32.tensorFlowDataType]
+        let outputShapes: [TensorShape?] = [nil, nil]
+        let dataset: VariantHandle = Raw.tensorSliceDataset(
+            components: [elements1, elements2], outputShapes: outputShapes)
+        let iterator: ResourceHandle = Raw.iteratorV2(
+            sharedName: "blah", container: "earth",
+            outputTypes: outputTypes, outputShapes: outputShapes)
+        Raw.makeIterator(dataset: dataset, iterator: iterator)
+        var next: SimpleOutput = Raw.iteratorGetNext(
+            iterator: iterator, outputShapes: outputShapes)
+        XCTAssertEqual(0, Tensor(handle: next.a).scalarized())
+        XCTAssertEqual(10, Tensor(handle: next.b).scalarized())
+        next = Raw.iteratorGetNext(
+            iterator: iterator, outputShapes: outputShapes)
+        XCTAssertEqual(1, Tensor(handle: next.a).scalarized())
+        XCTAssertEqual(11, Tensor(handle: next.b).scalarized())
+        next = Raw.iteratorGetNext(
+            iterator: iterator, outputShapes: outputShapes)
+        XCTAssertEqual(2, Tensor(handle: next.a).scalarized())
+        XCTAssertEqual(12, Tensor(handle: next.b).scalarized())
+    }
+
+    func testSingleValueManualIterator() {
+        // [[0], [1], [2], [3], [4]]
+        let scalars = Tensor<Float>(rangeFrom: 0, to: 5, stride: 1)
+            .reshaped(to: [5, 1])
+        let dataset = Dataset(elements: scalars)
+        var iterator = dataset.makeIterator()
+        var i: Int = 0
+        while let item = iterator.next() {
+            XCTAssertEqual(scalars[i].array, item.array)
+            i += 1
+        }
+        XCTAssertEqual(5, i, "iterator should visit all 5 elements")
+    }
+
+    func testDatasetIteration() {
+        // [[0], [1], [2], [3], [4]]
+        let scalars = Tensor<Float>(rangeFrom: 0, to: 5, stride: 1)
+            .reshaped(to: [5, 1])
+        let dataset = Dataset(elements: scalars)
+        var i: Int = 0
+        for item in dataset {
+            XCTAssertEqual(scalars[i].array, item.array)
+            i += 1
+        }
+        XCTAssertEqual(5, i, "iteration should visit all 5 elements")
+    }
+
+    func testSingleValueTransformations() {
+        let scalars = Tensor<Float>(rangeFrom: 0, to: 5, stride: 1)
+        let dataset = Dataset(elements: scalars)
+        let shuffled = dataset.shuffled(sampleCount: 5, randomSeed: 42)
+        XCTAssertEqual([0, 4, 1, 3, 2], shuffled.map { $0.scalar! })
+    }
+
+    func testSingleValueHOFs() {
+        let scalars = Tensor<Float>(rangeFrom: 0, to: 5, stride: 1)
+        let dataset = Dataset(elements: scalars)
+        let addedOne: Dataset = dataset.map { $0 + 1 }
+        XCTAssertEqual([1, 2, 3, 4, 5], addedOne.flatMap { $0.scalars })
+        // Use '.==' in the following closure to avoid any conversions to
+        // host data types, which is not handled correctly in tracing.
+        let evens: Dataset = dataset.filter { Tensor($0 % 2) .== Tensor(0) }
+        XCTAssertEqual([0, 2, 4], evens.flatMap { $0.scalars })
+    }
+
+    func testParallelMap() {
+        let scalars = Tensor<Float>(rangeFrom: 0, to: 5, stride: 1)
+        let dataset = Dataset(elements: scalars)
+        let addedOne: Dataset = dataset.map(parallelCallCount: 5) { $0 + 1 }
+        XCTAssertEqual([1, 2, 3, 4, 5], addedOne.flatMap { $0.scalars })
+        // Use '.==' in the following closure to avoid any conversions to
+        // host data types, which is not handled correctly in tracing.
+        let evens: Dataset = dataset.filter { Tensor($0 % 2) .== Tensor(0) }
+        XCTAssertEqual([0, 2, 4], evens.flatMap { $0.scalars })
+    }
+
+    func testMapToDifferentType() {
+        let scalars = Tensor<Float>(rangeFrom: 0, to: 5, stride: 1)
+        let dataset = Dataset(elements: scalars)
+        let shuffled = dataset.shuffled(sampleCount: 5, randomSeed: 42)
+        XCTAssertEqual([0, 4, 1, 3, 2], shuffled.map { $0.scalar! })
+        let evens = shuffled.map { Tensor($0 % 2) .== Tensor(0) }
+        XCTAssertEqual([true, true, false, false, true], evens.map { $0.scalar! })
+    }
+
+    func testSingleValueBatched() {
+        let scalars = Tensor<Float>(rangeFrom: 0, to: 5, stride: 1)
+        let dataset = Dataset(elements: scalars)
+        let batched = dataset.batched(2)
+
+        var iterator = batched.makeIterator()
+        XCTAssertEqual([0, 1], iterator.next()?.scalars)
+        XCTAssertEqual([2, 3], iterator.next()?.scalars)
+        // The last batch is partial: 5 elements do not divide evenly by 2.
+        XCTAssertEqual([4], iterator.next()?.scalars)
+        XCTAssertNil(iterator.next(), "iterator should be exhausted")
+    }
+
+/*
+    func testDoubleValueDatasetIteration() {
+        let scalars1 = Tensor<Float>(rangeFrom: 0, to: 5, stride: 1)
+        let scalars2 = Tensor<Int32>(rangeFrom: 5, to: 10, stride: 1)
+        let datasetLeft = Dataset(elements: scalars1)
+        let datasetRight = Dataset(elements: scalars2)
+        var i: Int = 0
+        for pair in zip(datasetLeft, datasetRight) {
+            XCTAssertEqual(scalars1[i].array, pair.first.array)
+            XCTAssertEqual(scalars2[i].array, pair.second.array)
+            i += 1
+        }
+    }
+*/
+
+    static var allTests = [
+        ("testMultiValue", testMultiValue),
+        ("testSingleValueManualIterator", testSingleValueManualIterator),
+        ("testDatasetIteration", testDatasetIteration),
+        ("testSingleValueTransformations", testSingleValueTransformations),
+        ("testSingleValueHOFs", testSingleValueHOFs),
+        ("testParallelMap", testParallelMap),
+        ("testMapToDifferentType", testMapToDifferentType),
+        ("testSingleValueBatched", testSingleValueBatched),
+        // Currently broken even in TensorFlow ...
+        // This will be easier to fix once everything is moved ...
+        // ("testDoubleValueDatasetIteration", testDoubleValueDatasetIteration),
+    ]
+}
diff --git a/Tests/DeepLearningTests/XCTestManifests.swift b/Tests/DeepLearningTests/XCTestManifests.swift
@@ -23,6 +23,7 @@ public func allTests() -> [XCTestCaseEntry] {
         testCase(SequentialTests.allTests),
         testCase(LayerTests.allTests),
         testCase(TensorTests.allTests),
+        testCase(DatasetTests.allTests),
     ]
 }
 #endif