Skip to content

Commit d368524

Browse files
toyama0919 authored and jesterhazy committed
support binary by NoneSplitter. (#954)
1 parent a999b16 commit d368524

File tree

2 files changed

+33
-5
lines changed

2 files changed

+33
-5
lines changed

src/sagemaker/local/data.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -221,20 +221,39 @@ def split(self, file):
221221
class NoneSplitter(Splitter):
222222
"""Does not split records, essentially reads the whole file."""
223223

224-
def split(self, file):
224+
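# Byte values treated as text (BEL, BS, TAB, LF, FF, CR, ESC and 0x20-0xFF except DEL); any other byte marks the data as binary.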
225+
_textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
226+
227+
def split(self, filename):
225228
"""Split a file into records using a specific strategy.
226229
227230
For this NoneSplitter there is no actual split happening and the file
228231
is returned as a whole.
229232
230233
Args:
231-
file (str): path to the file to split
234+
filename (str): path to the file to split
232235
233236
Returns: generator for the individual records that were split from
234237
the file
235238
"""
236-
with open(file, "r") as f:
237-
yield f.read()
239+
with open(filename, "rb") as f:
240+
buf = f.read()
241+
if not self._is_binary(buf):
242+
buf = buf.decode()
243+
yield buf
244+
245+
def _is_binary(self, buf):
246+
"""binary check.
247+
Check whether `buf` contains binary data.
248+
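Returns True if `buf` contains any byte outside the text byte set `_textchars` (for example NUL or other control bytes). This is a heuristic; it does not check that `buf` is valid UTF-8.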
249+
250+
Args:
251+
buf (bytes): data to inspect
252+
253+
Returns:
254+
True if data is binary, otherwise False
255+
"""
256+
return bool(buf.translate(None, self._textchars))
238257

239258

240259
class LineSplitter(Splitter):

tests/unit/test_local_data.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,15 +110,24 @@ def test_get_splitter_instance_with_invalid_types():
110110

111111

112112
def test_none_splitter(tmpdir):
113+
splitter = sagemaker.local.data.NoneSplitter()
114+
113115
test_file_path = tmpdir.join("none_test.txt")
114116

115117
with test_file_path.open("w") as f:
116118
f.write("this\nis\na\ntest")
117119

118-
splitter = sagemaker.local.data.NoneSplitter()
119120
data = [x for x in splitter.split(str(test_file_path))]
120121
assert data == ["this\nis\na\ntest"]
121122

123+
test_bin_file_path = tmpdir.join("none_test.bin")
124+
125+
with test_bin_file_path.open("wb") as f:
126+
f.write(b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00C")
127+
128+
data = [x for x in splitter.split(str(test_bin_file_path))]
129+
assert data == [b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00C"]
130+
122131

123132
def test_line_splitter(tmpdir):
124133
test_file_path = tmpdir.join("line_test.txt")

0 commit comments

Comments
 (0)