Skip to content

Commit 75cfcb1

Browse files
brockwade633 and Brock Wade authored
fix: security update -> use sha256 instead of md5 for file hashing (#4965)
* fix: security update -> use sha256 instead of md5 for file hashing
* fix: security update -> use sha256 instead of md5 for file hashing
* fix flake8
* fix: test spacing

---------

Co-authored-by: Brock Wade <[email protected]>
1 parent e13078f commit 75cfcb1

File tree

4 files changed: +32 additions, -31 deletions

src/sagemaker/workflow/utilities.py

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -268,29 +268,29 @@ def get_config_hash(step: Entity):
268268

269269

270270
def hash_object(obj) -> str:
271-
"""Get the MD5 hash of an object.
271+
"""Get the SHA256 hash of an object.
272272
273273
Args:
274274
obj (dict): The object
275275
Returns:
276-
str: The MD5 hash of the object
276+
str: The SHA256 hash of the object
277277
"""
278-
return hashlib.md5(str(obj).encode()).hexdigest()
278+
return hashlib.sha256(str(obj).encode()).hexdigest()
279279

280280

281281
def hash_file(path: str) -> str:
282-
"""Get the MD5 hash of a file.
282+
"""Get the SHA256 hash of a file.
283283
284284
Args:
285285
path (str): The local path for the file.
286286
Returns:
287-
str: The MD5 hash of the file.
287+
str: The SHA256 hash of the file.
288288
"""
289-
return _hash_file(path, hashlib.md5()).hexdigest()
289+
return _hash_file(path, hashlib.sha256()).hexdigest()
290290

291291

292292
def hash_files_or_dirs(paths: List[str]) -> str:
293-
"""Get the MD5 hash of the contents of a list of files or directories.
293+
"""Get the SHA256 hash of the contents of a list of files or directories.
294294
295295
Hash is changed if:
296296
* input list is changed
@@ -301,58 +301,58 @@ def hash_files_or_dirs(paths: List[str]) -> str:
301301
Args:
302302
paths: List of file or directory paths
303303
Returns:
304-
str: The MD5 hash of the list of files or directories.
304+
str: The SHA256 hash of the list of files or directories.
305305
"""
306-
md5 = hashlib.md5()
306+
sha256 = hashlib.sha256()
307307
for path in sorted(paths):
308-
md5 = _hash_file_or_dir(path, md5)
309-
return md5.hexdigest()
308+
sha256 = _hash_file_or_dir(path, sha256)
309+
return sha256.hexdigest()
310310

311311

312-
def _hash_file_or_dir(path: str, md5: Hash) -> Hash:
312+
def _hash_file_or_dir(path: str, sha256: Hash) -> Hash:
313313
"""Updates the inputted Hash with the contents of the current path.
314314
315315
Args:
316316
path: path of file or directory
317317
Returns:
318-
str: The MD5 hash of the file or directory
318+
str: The SHA256 hash of the file or directory
319319
"""
320320
if isinstance(path, str) and path.lower().startswith("file://"):
321321
path = unquote(urlparse(path).path)
322-
md5.update(path.encode())
322+
sha256.update(path.encode())
323323
if Path(path).is_dir():
324-
md5 = _hash_dir(path, md5)
324+
sha256 = _hash_dir(path, sha256)
325325
elif Path(path).is_file():
326-
md5 = _hash_file(path, md5)
327-
return md5
326+
sha256 = _hash_file(path, sha256)
327+
return sha256
328328

329329

330-
def _hash_dir(directory: Union[str, Path], md5: Hash) -> Hash:
330+
def _hash_dir(directory: Union[str, Path], sha256: Hash) -> Hash:
331331
"""Updates the inputted Hash with the contents of the current path.
332332
333333
Args:
334334
directory: path of the directory
335335
Returns:
336-
str: The MD5 hash of the directory
336+
str: The SHA256 hash of the directory
337337
"""
338338
if not Path(directory).is_dir():
339339
raise ValueError(str(directory) + " is not a valid directory")
340340
for path in sorted(Path(directory).iterdir()):
341-
md5.update(path.name.encode())
341+
sha256.update(path.name.encode())
342342
if path.is_file():
343-
md5 = _hash_file(path, md5)
343+
sha256 = _hash_file(path, sha256)
344344
elif path.is_dir():
345-
md5 = _hash_dir(path, md5)
346-
return md5
345+
sha256 = _hash_dir(path, sha256)
346+
return sha256
347347

348348

349-
def _hash_file(file: Union[str, Path], md5: Hash) -> Hash:
349+
def _hash_file(file: Union[str, Path], sha256: Hash) -> Hash:
350350
"""Updates the inputted Hash with the contents of the current path.
351351
352352
Args:
353353
file: path of the file
354354
Returns:
355-
str: The MD5 hash of the file
355+
str: The SHA256 hash of the file
356356
"""
357357
if isinstance(file, str) and file.lower().startswith("file://"):
358358
file = unquote(urlparse(file).path)
@@ -363,8 +363,8 @@ def _hash_file(file: Union[str, Path], md5: Hash) -> Hash:
363363
data = f.read(BUF_SIZE)
364364
if not data:
365365
break
366-
md5.update(data)
367-
return md5
366+
sha256.update(data)
367+
return sha256
368368

369369

370370
def validate_step_args_input(

tests/unit/sagemaker/workflow/test_steps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,7 @@ def test_processing_step_normalizes_args_with_local_code(mock_normalize_args, sc
671671
mock_normalize_args.return_value = [step.inputs, step.outputs]
672672
step.to_request()
673673
mock_normalize_args.assert_called_with(
674-
job_name="MyProcessingStep-3e89f0c7e101c356cbedf27d9d27e9db",
674+
job_name="MyProcessingStep-a22fc59b38f13da26f6a40b18687ba598cf669f74104b793cefd9c63eddf4ac7",
675675
arguments=step.job_arguments,
676676
inputs=step.inputs,
677677
outputs=step.outputs,

tests/unit/sagemaker/workflow/test_utilities.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,14 @@ def test_hash_file():
3131
with tempfile.NamedTemporaryFile() as tmp:
3232
tmp.write("hashme".encode())
3333
hash = hash_file(tmp.name)
34-
assert hash == "d41d8cd98f00b204e9800998ecf8427e"
34+
assert hash == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
3535

3636

3737
def test_hash_file_uri():
3838
with tempfile.NamedTemporaryFile() as tmp:
3939
tmp.write("hashme".encode())
4040
hash = hash_file(f"file:///{tmp.name}")
41-
assert hash == "d41d8cd98f00b204e9800998ecf8427e"
41+
assert hash == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
4242

4343

4444
def test_hash_files_or_dirs_with_file():

tests/unit/sagemaker/workflow/test_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ def test_repack_model_step(estimator):
8282
assert hyperparameters["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"'
8383
assert (
8484
hyperparameters["sagemaker_submit_directory"]
85-
== '"s3://my-bucket/MyRepackModelStep-b5ea77f701b47a8d075605497462ccc2/source/sourcedir.tar.gz"'
85+
== '"s3://my-bucket/MyRepackModelStep-717d7bdd388168c27e9ad2938ff0314e35be50b3157cf2498688c7525ea27e1e\
86+
/source/sourcedir.tar.gz"'
8687
)
8788

8889
del request_dict["Arguments"]["HyperParameters"]

0 commit comments

Comments
 (0)