@@ -173,21 +173,33 @@ def __init__(
 
     def get_run_args(
         self,
-        submit_app,
+        code=None,
         inputs=None,
         outputs=None,
         arguments=None,
-        job_name=None,
-        kms_key=None,
     ):
-        # TODO: description
+        """Returns a RunArgs object.
+
+        For processors (:class:`~sagemaker.spark.processing.PySparkProcessor`,
+        :class:`~sagemaker.spark.processing.SparkJarProcessor`) that have special
+        run() arguments, this object contains the normalized arguments for passing
+        to :class:`~sagemaker.workflow.steps.ProcessingStep`.
+
+        Args:
+            code (str): An S3 URI or a local path to a file with the framework
+                script to run.
+            inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
+                the processing job. These must be provided as
+                :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
+            outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
+                the processing job. These can be specified as either path strings or
+                :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
+            arguments (list[str]): A list of string arguments to be passed to a
+                processing job (default: None).
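+
+        Example (a sketch, not part of the library's tests; ``processor`` is
+        assumed to be an already-constructed
+        :class:`~sagemaker.spark.processing.PySparkProcessor`, and the S3 URI
+        is a placeholder)::
+
+            run_args = processor.get_run_args(
+                submit_app="s3://example-bucket/code/preprocess.py",
+            )
+            step = ProcessingStep(
+                name="spark-preprocess",
+                processor=processor,
+                inputs=run_args.inputs,
+                outputs=run_args.outputs,
+                job_arguments=run_args.arguments,
+                code=run_args.code,
+            )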
+        """
         return super().get_run_args(
-            code=submit_app,
+            code=code,
             inputs=inputs,
             outputs=outputs,
             arguments=arguments,
-            job_name=job_name,
-            kms_key=kms_key,
         )
 
     def run(
@@ -716,8 +728,35 @@ def get_run_args(
         job_name=None,
         configuration=None,
         spark_event_logs_s3_uri=None,
-        kms_key=None,
     ):
+        """Returns a RunArgs object.
+
+        This object contains the normalized inputs, outputs and arguments
+        needed when using a ``PySparkProcessor`` in a
+        :class:`~sagemaker.workflow.steps.ProcessingStep`.
+
+        Args:
+            submit_app (str): Path (local or S3) to Python file to submit to Spark
+                as the primary application.
+            submit_py_files (list[str]): List of paths (local or S3) to provide for
+                `spark-submit --py-files` option.
+            submit_jars (list[str]): List of paths (local or S3) to provide for
+                `spark-submit --jars` option.
+            submit_files (list[str]): List of paths (local or S3) to provide for
+                `spark-submit --files` option.
+            inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
+                the processing job. These must be provided as
+                :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
+            outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
+                the processing job. These can be specified as either path strings or
+                :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
+            arguments (list[str]): A list of string arguments to be passed to a
+                processing job (default: None).
+            job_name (str): Processing job name. If not specified, the processor generates
+                a default job name, based on the base job name and current timestamp.
+            configuration (list[dict] or dict): Configuration for Hadoop, Spark, or Hive.
+                List or dictionary of EMR-style classifications:
+                https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html
+            spark_event_logs_s3_uri (str): S3 path where Spark application events will
+                be published.
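+
+        Example (a sketch; the role ARN, instance settings, and S3 paths below
+        are placeholders, not values supplied by this library)::
+
+            processor = PySparkProcessor(
+                framework_version="2.4",
+                role="arn:aws:iam::111122223333:role/example-role",
+                instance_count=2,
+                instance_type="ml.m5.xlarge",
+            )
+            run_args = processor.get_run_args(
+                submit_app="s3://example-bucket/code/preprocess.py",
+                arguments=["--input", "s3://example-bucket/input"],
+            )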
+        """
         self._current_job_name = self._generate_current_job_name(job_name=job_name)
         self.command = [_SparkProcessorBase._default_command]
@@ -734,14 +773,11 @@ def get_run_args(
             spark_event_logs_s3_uri=spark_event_logs_s3_uri,
         )
 
-        # TODO: description
         return super().get_run_args(
-            submit_app=submit_app,
+            code=submit_app,
             inputs=extended_inputs,
             outputs=extended_outputs,
             arguments=arguments,
-            job_name=self._current_job_name,
-            kms_key=kms_key,
         )
 
     def run(
@@ -821,6 +857,7 @@ def run(
             logs=logs,
             job_name=self._current_job_name,
             experiment_config=experiment_config,
+            kms_key=kms_key,
         )
 
     def _extend_processing_args(self, inputs, outputs, **kwargs):
@@ -937,8 +974,35 @@ def get_run_args(
         job_name=None,
         configuration=None,
         spark_event_logs_s3_uri=None,
-        kms_key=None,
     ):
+        """Returns a RunArgs object.
+
+        This object contains the normalized inputs, outputs and arguments
+        needed when using a ``SparkJarProcessor`` in a
+        :class:`~sagemaker.workflow.steps.ProcessingStep`.
+
+        Args:
+            submit_app (str): Path (local or S3) to Jar file to submit to Spark
+                as the primary application.
+            submit_class (str): Java class reference to submit to Spark as the primary
+                application.
+            submit_jars (list[str]): List of paths (local or S3) to provide for
+                `spark-submit --jars` option.
+            submit_files (list[str]): List of paths (local or S3) to provide for
+                `spark-submit --files` option.
+            inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
+                the processing job. These must be provided as
+                :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
+            outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
+                the processing job. These can be specified as either path strings or
+                :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
+            arguments (list[str]): A list of string arguments to be passed to a
+                processing job (default: None).
+            job_name (str): Processing job name. If not specified, the processor generates
+                a default job name, based on the base job name and current timestamp.
+            configuration (list[dict] or dict): Configuration for Hadoop, Spark, or Hive.
+                List or dictionary of EMR-style classifications:
+                https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html
+            spark_event_logs_s3_uri (str): S3 path where Spark application events will
+                be published.
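+
+        Example (a sketch; the jar path, class name, and the pre-built
+        ``processor`` are placeholders)::
+
+            run_args = processor.get_run_args(
+                submit_app="s3://example-bucket/code/spark-app.jar",
+                submit_class="com.example.SparkApp",
+                arguments=["--env", "test"],
+            )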
+        """
         self._current_job_name = self._generate_current_job_name(job_name=job_name)
         self.command = [_SparkProcessorBase._default_command]
@@ -955,14 +1019,11 @@ def get_run_args(
             spark_event_logs_s3_uri=spark_event_logs_s3_uri,
         )
 
-        # TODO: description
         return super().get_run_args(
-            submit_app=submit_app,
+            code=submit_app,
             inputs=extended_inputs,
             outputs=extended_outputs,
             arguments=arguments,
-            job_name=self._current_job_name,
-            kms_key=kms_key,
         )
 
     def run(