Skip to content
This repository was archived by the owner on Apr 23, 2025. It is now read-only.

Use Epochs to load CIFAR10 #495

Merged
merged 1 commit into from
May 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Benchmarks/Models/ResNetCIFAR10.swift
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ enum ResNetCIFAR10: BenchmarkModel {
}

/// Builds the inference benchmark for `ResNet56` from `settings`.
// NOTE(review): two consecutive `return` statements — the second is unreachable
// as written; this looks like a before/after pair and only one should remain.
static func makeInferenceBenchmark(settings: BenchmarkSettings) -> Benchmark {
return ImageClassificationInference<ResNet56, CIFAR10>(settings: settings)
return ImageClassificationInference<ResNet56, OldCIFAR10>(settings: settings)
}

/// Builds the training benchmark for `ResNet56` from `settings`.
// NOTE(review): two consecutive `return` statements — the second is unreachable
// as written; this looks like a before/after pair and only one should remain.
static func makeTrainingBenchmark(settings: BenchmarkSettings) -> Benchmark {
return ImageClassificationTraining<ResNet56, CIFAR10>(settings: settings)
return ImageClassificationTraining<ResNet56, OldCIFAR10>(settings: settings)
}
}

Expand Down
228 changes: 125 additions & 103 deletions Datasets/CIFAR10/CIFAR10.swift
Original file line number Diff line number Diff line change
Expand Up @@ -22,120 +22,142 @@ import ModelSupport
import TensorFlow
import Batcher

public struct CIFAR10: ImageClassificationDataset {
public typealias SourceDataSet = [TensorPair<Float, Int32>]
public let training: Batcher<SourceDataSet>
public let test: Batcher<SourceDataSet>

public init(batchSize: Int) {
self.init(
batchSize: batchSize,
remoteBinaryArchiveLocation: URL(
string: "https://storage.googleapis.com/s4tf-hosted-binaries/datasets/CIFAR10/cifar-10-binary.tar.gz")!,
normalizing: true)
/// The CIFAR-10 dataset, exposed as a lazy sequence of training epochs and a
/// lazy collection of validation batches.
///
/// Samples are kept as raw bytes plus a label; each batch is turned into a
/// `LabeledImage` by `makeBatch` only when it is accessed.
public struct CIFAR10<Entropy: RandomNumberGenerator> {
/// Type of the collection of non-collated batches.
public typealias Batches = Slices<Sampling<[(data: [UInt8], label: Int32)], ArraySlice<Int>>>
/// The type of the training data, represented as a sequence of epochs, each
/// of which is a collection of batches.
public typealias Training = LazyMapSequence<
TrainingEpochs<[(data: [UInt8], label: Int32)], Entropy>,
LazyMapSequence<Batches, LabeledImage>
>
/// The type of the validation data, represented as a collection of batches.
public typealias Validation = LazyMapSequence<Slices<[(data: [UInt8], label: Int32)]>, LabeledImage>
/// The training epochs.
public let training: Training
/// The validation batches.
public let validation: Validation

/// Creates an instance with `batchSize`, using the default hosted archive
/// location and normalized batches (`normalizing: true`).
///
/// - Parameters:
///   - batchSize: number of samples in each batch.
///   - entropy: a source of randomness used to shuffle sample ordering. It
///     will be stored in `self`, so if it is only pseudorandom and has value
///     semantics, the sequence of epochs is deterministic and not dependent
///     on other operations.
public init(batchSize: Int, entropy: Entropy) {
self.init(
batchSize: batchSize,
entropy: entropy,
remoteBinaryArchiveLocation: URL(
string: "https://storage.googleapis.com/s4tf-hosted-binaries/datasets/CIFAR10/cifar-10-binary.tar.gz")!,
normalizing: true)
}

/// Creates an instance with `batchSize` using `remoteBinaryArchiveLocation`.
///
/// - Parameters:
///   - batchSize: number of samples in each training and validation batch.
///   - entropy: a source of randomness used to shuffle sample ordering. It
///     will be stored in `self`, so if it is only pseudorandom and has value
///     semantics, the sequence of epochs is deterministic and not dependent
///     on other operations.
///   - remoteBinaryArchiveLocation: URL of the CIFAR-10 binary archive, passed
///     to `downloadCIFAR10IfNotPresent(from:to:)`.
///   - localStorageDirectory: directory holding the dataset files. Defaults to
///     the `CIFAR10` subdirectory of `DatasetUtilities.defaultDirectory`.
///   - normalizing: normalizes the batches with the mean and standard deviation
///     of the dataset iff `true`. This initializer has no default for it;
///     `init(batchSize:entropy:)` passes `true`.
public init(
batchSize: Int,
entropy: Entropy,
remoteBinaryArchiveLocation: URL,
localStorageDirectory: URL = DatasetUtilities.defaultDirectory
.appendingPathComponent("CIFAR10", isDirectory: true),
normalizing: Bool
){
downloadCIFAR10IfNotPresent(from: remoteBinaryArchiveLocation, to: localStorageDirectory)

// Training data: every epoch is mapped lazily, and so is every batch inside
// it, so tensors are only built when a batch is actually requested.
let trainingSamples = loadCIFARTrainingFiles(in: localStorageDirectory)
training = TrainingEpochs(samples: trainingSamples, batchSize: batchSize, entropy: entropy)
.lazy.map { (batches: Batches) -> LazyMapSequence<Batches, LabeledImage> in
return batches.lazy.map{ makeBatch(samples: $0, normalizing: normalizing) }
}

// Validation data: batched in file order (no entropy involved) and collated
// lazily.
let validationSamples = loadCIFARTestFile(in: localStorageDirectory)
validation = validationSamples.inBatches(of: batchSize).lazy.map {
makeBatch(samples: $0, normalizing: normalizing)
}
}
}

/// Creates an instance backed by `Batcher`s over the fully loaded dataset.
///
/// Calls `downloadCIFAR10IfNotPresent(from:to:)` first, then wraps the training
/// samples in a shuffling `Batcher` and the test samples in a `Batcher` that
/// does not request shuffling.
///
/// - Parameters:
///   - batchSize: batch size handed to both `Batcher`s.
///   - remoteBinaryArchiveLocation: URL of the CIFAR-10 binary archive.
///   - localStorageDirectory: directory holding the dataset files. Defaults to
///     the `CIFAR10` subdirectory of `DatasetUtilities.defaultDirectory`.
///   - normalizing: forwarded to the file loaders.
public init(
batchSize: Int,
remoteBinaryArchiveLocation: URL,
localStorageDirectory: URL = DatasetUtilities.defaultDirectory
.appendingPathComponent("CIFAR10", isDirectory: true),
normalizing: Bool)
{
downloadCIFAR10IfNotPresent(from: remoteBinaryArchiveLocation, to: localStorageDirectory)
self.training = Batcher(
on: loadCIFARTrainingFiles(localStorageDirectory: localStorageDirectory, normalizing: normalizing),
batchSize: batchSize,
numWorkers: 1, //No need to use parallelism since everything is loaded in memory
shuffle: true)
self.test = Batcher(
on: loadCIFARTestFile(localStorageDirectory: localStorageDirectory, normalizing: normalizing),
batchSize: batchSize,
numWorkers: 1) //No need to use parallelism since everything is loaded in memory
}
extension CIFAR10: ImageClassificationData where Entropy == SystemRandomNumberGenerator {
  /// Creates an instance with `batchSize`, drawing the entropy used to shuffle
  /// sample ordering from a fresh `SystemRandomNumberGenerator`.
  public init(batchSize: Int) {
    let systemEntropy = SystemRandomNumberGenerator()
    self.init(batchSize: batchSize, entropy: systemEntropy)
  }
}

/// Fetches the CIFAR-10 binary archive unless the dataset is already on disk.
///
/// The dataset counts as present when the `cifar-10-batches-bin` subdirectory
/// of `directory` exists and contains at least one entry; in that case nothing
/// is downloaded.
///
/// - Parameters:
///   - location: URL of the remote archive. Only its parent is used as the
///     remote root; the resource name is fixed to `cifar-10-binary` with the
///     `tar.gz` extension.
///   - directory: local storage directory handed to
///     `DatasetUtilities.downloadResource`.
func downloadCIFAR10IfNotPresent(from location: URL, to directory: URL) {
  let downloadPath = directory.appendingPathComponent("cifar-10-batches-bin").path
  let directoryExists = FileManager.default.fileExists(atPath: downloadPath)
  // A directory whose contents cannot be listed is treated as empty, so the
  // download is attempted in that case too.
  let contentsOfDir = try? FileManager.default.contentsOfDirectory(atPath: downloadPath)
  let directoryEmpty = contentsOfDir?.isEmpty ?? true

  guard !directoryExists || directoryEmpty else { return }

  let _ = DatasetUtilities.downloadResource(
    filename: "cifar-10-binary", fileExtension: "tar.gz",
    remoteRoot: location.deletingLastPathComponent(), localStorageDirectory: directory)
}

func loadCIFARFile(named name: String, in directory: URL, normalizing: Bool = true) -> [TensorPair<Float, Int32>] {
let path = directory.appendingPathComponent("cifar-10-batches-bin/\(name)").path

let imageCount = 10000
guard let fileContents = try? Data(contentsOf: URL(fileURLWithPath: path)) else {
printError("Could not read dataset file: \(name)")
exit(-1)
}
guard fileContents.count == 30_730_000 else {
printError(
"Dataset file \(name) should have 30730000 bytes, instead had \(fileContents.count)")
exit(-1)
}

var bytes: [UInt8] = []
var labels: [Int64] = []

let imageByteSize = 3073
for imageIndex in 0..<imageCount {
let baseAddress = imageIndex * imageByteSize
labels.append(Int64(fileContents[baseAddress]))
bytes.append(contentsOf: fileContents[(baseAddress + 1)..<(baseAddress + 3073)])
}

let labelTensor = Tensor<Int64>(shape: [imageCount], scalars: labels)
let images = Tensor<UInt8>(shape: [imageCount, 3, 32, 32], scalars: bytes)

// Transpose from the CIFAR-provided N(CHW) to TF's default NHWC.
var imageTensor = Tensor<Float>(images.transposed(permutation: [0, 2, 3, 1]))

// The value of mean and std were calculated with the following Swift code:
// ```
// import TensorFlow
// import Datasets
// import Foundation
// let urlString = "https://storage.googleapis.com/s4tf-hosted-binaries/datasets/CIFAR10/cifar-10-binary.tar.gz"
// let cifar = CIFAR10(batchSize: 50000,
// remoteBinaryArchiveLocation: URL(string: urlString)!,
// normalizing: false)
// for batch in cifar.training.sequenced() {
// let images = Tensor<Double>(batch.first) / 255.0
// let mom = images.moments(squeezingAxes: [0,1,2])
// print("mean: \(mom.mean) std: \(sqrt(mom.variance))")
// }
// ```
if normalizing {
let mean = Tensor<Float>(
[0.4913996898,
0.4821584196,
0.4465309242])
let std = Tensor<Float>(
[0.2470322324,
0.2434851280,
0.2615878417])
imageTensor = ((imageTensor / 255.0) - mean) / std
}

return (0..<imageCount).map { TensorPair(first: imageTensor[$0], second: Tensor<Int32>(labelTensor[$0])) }

/// Reads one CIFAR-10 binary batch file into raw labeled images.
///
/// The file is expected to hold 10000 records of 3073 bytes each: one label
/// byte followed by 3072 image bytes. If the file cannot be read or does not
/// have the expected size, an error is printed and the process exits with
/// status -1.
///
/// - Parameters:
///   - name: file name inside the `cifar-10-batches-bin` subdirectory.
///   - directory: local storage directory of the dataset.
/// - Returns: the image bytes and label of every record, in file order.
func loadCIFARFile(named name: String, in directory: URL) -> [(data: [UInt8], label: Int32)] {
  let filePath = directory.appendingPathComponent("cifar-10-batches-bin/\(name)").path
  let fileURL = URL(fileURLWithPath: filePath)

  guard let rawBytes = try? Data(contentsOf: fileURL) else {
    printError("Could not read dataset file: \(name)")
    exit(-1)
  }
  guard rawBytes.count == 30_730_000 else {
    printError(
      "Dataset file \(name) should have 30730000 bytes, instead had \(rawBytes.count)")
    exit(-1)
  }

  let recordCount = 10000
  let recordSize = 3073
  // Walk the record start offsets; byte 0 of a record is the label and the
  // remaining bytes are the image.
  return stride(from: 0, to: recordCount * recordSize, by: recordSize).map { offset in
    (
      data: [UInt8](rawBytes[(offset + 1)..<(offset + recordSize)]),
      label: Int32(rawBytes[offset])
    )
  }
}

func loadCIFARTrainingFiles(localStorageDirectory: URL, normalizing: Bool = true) -> [TensorPair<Float, Int32>] {
let data = (1..<6).map {
loadCIFARFile(named: "data_batch_\($0).bin", in: localStorageDirectory, normalizing: normalizing)
}
return data.reduce([], +)
/// Loads the five training files `data_batch_1.bin` through `data_batch_5.bin`
/// from `localStorageDirectory` and concatenates their samples in file order.
func loadCIFARTrainingFiles(in localStorageDirectory: URL) -> [(data: [UInt8], label: Int32)] {
  var samples: [(data: [UInt8], label: Int32)] = []
  for batchNumber in 1...5 {
    samples += loadCIFARFile(named: "data_batch_\(batchNumber).bin", in: localStorageDirectory)
  }
  return samples
}

func loadCIFARTestFile(localStorageDirectory: URL, normalizing: Bool = true) -> [TensorPair<Float, Int32>] {
return loadCIFARFile(named: "test_batch.bin", in: localStorageDirectory, normalizing: normalizing)
/// Loads the samples of `test_batch.bin` from `localStorageDirectory`.
func loadCIFARTestFile(in localStorageDirectory: URL) -> [(data: [UInt8], label: Int32)] {
  let testSamples = loadCIFARFile(named: "test_batch.bin", in: localStorageDirectory)
  return testSamples
}

/// Collates raw samples into a single `LabeledImage` batch.
///
/// The image bytes of all samples are concatenated, shaped as
/// `[samples.count, 3, 32, 32]`, transposed with permutation `[0, 2, 3, 1]`
/// (channels last), converted to `Float` and divided by 255.
///
/// - Parameters:
///   - samples: the samples of one batch, each holding its image bytes and label.
///   - normalizing: when `true`, the per-channel mean is subtracted and the
///     result divided by the per-channel standard deviation after scaling.
/// - Returns: the image tensor together with an `Int32` label tensor.
func makeBatch<BatchSamples: Collection>(samples: BatchSamples, normalizing: Bool) -> LabeledImage
where BatchSamples.Element == (data: [UInt8], label: Int32) {
  let pixelBytes: [UInt8] = samples.flatMap { $0.data }
  let rawImages = Tensor<UInt8>(shape: [samples.count, 3, 32, 32], scalars: pixelBytes)
  let labelTensor = Tensor<Int32>(samples.map { $0.label })

  // Channels-first to channels-last, then scale the byte values by 1/255.
  let scaledImages = Tensor<Float>(rawImages.transposed(permutation: [0, 2, 3, 1])) / 255.0

  guard normalizing else {
    return LabeledImage(data: scaledImages, label: labelTensor)
  }

  let channelMean = Tensor<Float>([0.4913996898, 0.4821584196, 0.4465309242])
  let channelStd = Tensor<Float>([0.2470322324, 0.2434851280, 0.2615878417])
  return LabeledImage(data: (scaledImages - channelMean) / channelStd, label: labelTensor)
}
Loading