|
/// Error thrown by the CSV parsing functions when the input is malformed.
struct ParseError {
    /// Human-readable description of what the parser expected or found.
    var message: String
}

extension ParseError: Error {}
| 4 | + |
// UTF-16 code units of the characters that give CSV its structure.
// They are built from ASCII scalar literals, so no force-unwrap is needed and
// the type stays `UInt16`, matching the elements of `Substring.utf16`.
let comma = UInt16(UInt8(ascii: ","))
let newline = UInt16(UInt8(ascii: "\n"))
// Must be "\r" (U+000D): defining it from "\n" made it indistinguishable from
// `newline`, so carriage returns were never recognised as line terminators.
let carriageReturn = UInt16(UInt8(ascii: "\r"))
let quote = UInt16(UInt8(ascii: "\""))
| 9 | + |
/// Consumes the rest of a quoted field from `remainder`.
///
/// The opening quote must already have been removed by the caller. A doubled
/// quote (`""`) inside the field is unescaped to a single `"`. If the closing
/// quote is directly followed by a comma, that comma is consumed as well; any
/// other following character is left in `remainder`.
///
/// - Parameter remainder: Unparsed input, positioned just after the opening quote.
/// - Returns: The unescaped contents of the field.
/// - Throws: `ParseError` when the input ends before a closing quote is found.
@inline(__always) func parseQuotedField(_ remainder: inout Substring) throws -> Substring? {
    var accumulated: Substring = ""

    while !remainder.isEmpty {
        guard let closing = remainder.index(of: "\"") else {
            throw ParseError(message: "Expected a closing \"")
        }

        // Keep everything in front of the quote, then step past the quote itself.
        accumulated += remainder[..<closing]
        remainder = remainder[remainder.index(after: closing)...]

        // Nothing after the quote: the field ran to the end of the input.
        guard let following = remainder.utf16.first else {
            return accumulated
        }

        if following == quote {
            // Two quotes in a row encode one literal quote; keep scanning.
            remainder.removeFirst()
            accumulated.append("\"")
            continue
        }

        if following == comma {
            // The field separator belongs to this field; drop it.
            remainder.removeFirst()
        }
        return accumulated
    }

    throw ParseError(message: "Expected a closing quote")
}
| 41 | + |
// Consume a single field from `remainder`
///
/// - A field starting with `"` is handed to `parseQuotedField`.
/// - An unquoted field runs up to the next comma (which is consumed) or up to
///   the next line terminator (which is left in place for the caller).
/// - Both LF and CR count as line terminators, so CRLF input does not leave a
///   stray "\r" at the end of the last field or produce a bogus "\r" field
///   after a quoted field.
///
/// - Parameter remainder: Unparsed input; advanced past the consumed field.
/// - Returns: The field, or `nil` when `remainder` is empty or positioned at a
///   line terminator (i.e. there are no more fields on this line).
/// - Throws: `ParseError` from `parseQuotedField` for an unterminated quoted field.
@inline(__always) func parseField(_ remainder: inout Substring) throws -> Substring? {
    guard let start = remainder.utf16.first else { return nil }
    switch start {
    case quote:
        remainder.removeFirst() // drop the opening quote
        return try parseQuotedField(&remainder)
    case newline, carriageReturn:
        // End of line: must return nil here, otherwise an empty field would be
        // produced without consuming anything and the caller would never advance.
        return nil
    default:
        // This is the most common case, so it scans raw UTF-16 code units.
        let units = remainder.utf16
        var index = units.startIndex
        while index < units.endIndex {
            switch units[index] {
            case comma:
                let field = remainder.prefix(upTo: index)
                remainder.remove(upToAndIncluding: index)
                return field
            case newline, carriageReturn:
                // Leave the terminator in `remainder`; the line parser consumes it.
                let field = remainder.prefix(upTo: index)
                remainder.remove(upTo: index)
                return field
            default:
                units.formIndex(after: &index)
            }
        }
        // No separator found: the field extends to the end of the input.
        let field = remainder
        remainder.removeAll()
        return field
    }
}
| 72 | + |
extension Substring {
    /// Drops everything before `index`; the element at `index` becomes the first one.
    mutating func remove(upTo index: Index) {
        self = self[index...]
    }

    /// Drops everything up to and including the element at `index`.
    /// `index` must refer to an existing element (not `endIndex`).
    mutating func remove(upToAndIncluding index: Index) {
        let next = self.index(after: index)
        self = self[next...]
    }
}
| 82 | + |
// Consume a single line from `remainder`
///
/// Fields are read with `parseField` until it returns `nil`; each one is passed
/// to `processField` together with its zero-based position on the line. Any run
/// of newline / carriage-return code units that follows is then consumed.
///
/// - Parameters:
///   - remainder: Unparsed input; advanced past the line and its terminators.
///   - result: Caller-owned state forwarded to `processField`.
///   - processField: Invoked once per field with the state, field index and field.
/// - Returns: `true` when at least one field was read and input remains afterwards.
/// - Throws: `ParseError` if the fields are not followed by a line terminator,
///   or whatever `parseField` throws.
func parseLine<State>(_ remainder: inout Substring, result: inout State, processField: (inout State, Int, Substring) -> ()) throws -> Bool {
    var fieldCount = 0

    while true {
        guard let field = try parseField(&remainder) else { break }
        processField(&result, fieldCount, field)
        fieldCount += 1
    }

    if let next = remainder.utf16.first {
        if next != carriageReturn && next != newline {
            throw ParseError(message: "Expected a newline or CR, got \(next)")
        }

        // Swallow the terminator together with any directly following blank lines.
        while let unit = remainder.utf16.first {
            guard unit == carriageReturn || unit == newline else { break }
            remainder.utf16.removeFirst()
        }
    }

    return fieldCount > 0 && !remainder.isEmpty
}
| 105 | + |
| 106 | +import Foundation |
| 107 | + |
/// Runs `f`, prints how long it took, and returns its result.
///
/// - Parameters:
///   - name: Label for the measurement; defaults to the calling function's name.
///   - line: Source line reported with the measurement; defaults to the call site.
///   - f: The work to measure.
/// - Returns: Whatever `f` returns.
/// - Throws: Rethrows any error from `f`; nothing is printed in that case.
func time<Result>(name: StaticString = #function, line: Int = #line, _ f: () throws -> Result) rethrows -> Result {
    let began = DispatchTime.now()
    let value = try f()
    let finished = DispatchTime.now()

    let elapsedNanoseconds = finished.uptimeNanoseconds - began.uptimeNanoseconds
    let seconds = Double(elapsedNanoseconds) / 1_000_000_000
    print("\(name) (line \(line)): \(seconds) sec")
    return value
}
| 116 | + |
try time {
    // Benchmark driver: parse the whole file line by line and count the lines.
    // Data set: http://www.maxmind.com/download/worldcities/worldcitiespop.txt.gz
    // Alternative input used during development:
    // let file = URL(fileURLWithPath: "/Users/chris/Downloads/1489325/stops.txt")
    let file = URL(fileURLWithPath: "worldcitiespop.txt")

    // Appending a string literal is a trick by Ole Begemann that forces the value
    // to be a native Swift String (not an NSString); the original author reported
    // it more than doubled the speed on their machine.
    // NOTE(review): the original comment mentioned `+ "\n"`, but the code appends
    // an empty string; confirm which one is intended.
    let contents = try String(contentsOf: file, encoding: .isoLatin1) + ""

    var remainder = Substring(contents)

    var lineCount: Int = 0
    var state: () = () // no per-field state is collected in this benchmark

    while !remainder.isEmpty {
        _ = try parseLine(&remainder, result: &state) { _, _, _ in
            ()
        }

        // Trace the first few iterations, then report progress every 100,000 lines.
        if lineCount < 10 {
            print("result: \(lineCount)")
        }
        lineCount += 1
        if lineCount % 100_000 == 0 {
            print(lineCount)
        }
    }

    print(lineCount)
}
0 commit comments