Skip to content

Commit 10ec72a

Browse files
committed
fix binary check.
1 parent 0e084d8 commit 10ec72a

File tree

1 file changed

+20
-2
lines changed

1 file changed

+20
-2
lines changed

src/sagemaker/local/data.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,8 @@ def split(self, file):
221221
class NoneSplitter(Splitter):
222222
"""Does not split records, essentially reads the whole file."""
223223

224+
_textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
225+
224226
def split(self, file):
225227
"""Split a file into records using a specific strategy.
226228
@@ -233,8 +235,24 @@ def split(self, file):
233235
Returns: generator for the individual records that were split from
234236
the file
235237
"""
236-
with open(file, "r+b") as f:
237-
yield f.read()
238+
with open(file, "rb") as f:
239+
buf = f.read()
240+
if not self._is_binary(buf):
241+
buf = buf.decode()
242+
yield buf
243+
244+
def _is_binary(self, buf):
245+
"""binary check.
246+
247+
binary or text check.
248+
249+
Args:
250+
buf (bytes): bytes in target file.
251+
252+
Returns:
253+
is binary(True) or text(False).
254+
"""
255+
return bool(buf.translate(None, self._textchars))
238256

239257

240258
class LineSplitter(Splitter):

0 commit comments

Comments
 (0)