Skip to content

Commit 06379db

Browse files
[DOCS] Adds descriptions to ML APIs (#2245)
Co-authored-by: Abdon Pijpelink <[email protected]>
1 parent a2e0bd9 commit 06379db

File tree

11 files changed

+754
-66
lines changed

11 files changed

+754
-66
lines changed

output/schema/schema.json

Lines changed: 187 additions & 66 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

specification/_doc_ids/table.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ ml-delete-snapshot,https://www.elastic.co/guide/en/elasticsearch/reference/{bran
248248
ml-feature-importance,https://www.elastic.co/guide/en/machine-learning/{branch}/ml-feature-importance.html
249249
ml-flush-job,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-flush-job.html
250250
ml-forecast,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-forecast.html
251+
ml-functions,https://www.elastic.co/guide/en/machine-learning/{branch}/ml-functions.html
251252
ml-get-bucket,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-bucket.html
252253
ml-get-calendar-event,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-calendar-event.html
253254
ml-get-calendar,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/ml-get-calendar.html

specification/ml/_types/Analysis.ts

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,16 +77,73 @@ whole number of buckets in one day. If the anomaly detection job uses a datafeed
7777
}
7878

7979
export class AnalysisConfigRead implements OverloadOf<AnalysisConfig> {
80+
/**
81+
* The size of the interval that the analysis is aggregated into, typically between `5m` and `1h`.
82+
*/
8083
bucket_span: Duration
84+
/**
85+
* If `categorization_field_name` is specified, you can also define the analyzer that is used to interpret the categorization field.
86+
* This property cannot be used at the same time as `categorization_filters`.
87+
* The categorization analyzer specifies how the `categorization_field` is interpreted by the categorization process.
88+
*/
8189
categorization_analyzer?: CategorizationAnalyzer
90+
/**
91+
* If this property is specified, the values of the specified field will be categorized.
92+
* The resulting categories must be used in a detector by setting `by_field_name`, `over_field_name`, or `partition_field_name` to the keyword `mlcategory`.
93+
*/
8294
categorization_field_name?: Field
95+
/**
96+
* If `categorization_field_name` is specified, you can also define optional filters.
97+
* This property expects an array of regular expressions.
98+
* The expressions are used to filter out matching sequences from the categorization field values.
99+
*/
83100
categorization_filters?: string[]
101+
/**
102+
* An array of detector configuration objects.
103+
* Detector configuration objects specify which data fields a job analyzes.
104+
* They also specify which analytical functions are used.
105+
* You can specify multiple detectors for a job.
106+
*/
84107
detectors: DetectorRead[]
108+
/**
109+
* A comma-separated list of influencer field names.
110+
* Typically these can be the by, over, or partition fields that are used in the detector configuration.
111+
* You might also want to use a field name that is not specifically named in a detector, but is available as part of the input data.
112+
* When you use multiple detectors, the use of influencers is recommended as it aggregates results for each influencer entity.
113+
*/
85114
influencers: Field[]
115+
/**
116+
* Advanced configuration option.
117+
* Affects the pruning of models that have not been updated for the given time duration.
118+
* The value must be set to a multiple of the `bucket_span`.
119+
* If set too low, important information may be removed from the model.
120+
* Typically, set to `30d` or longer.
121+
* If not set, model pruning only occurs if the model memory status reaches the soft limit or the hard limit.
122+
* For jobs created in 8.1 and later, the default value is the greater of `30d` or 20 times `bucket_span`.
123+
*/
86124
model_prune_window?: Duration
125+
/**
126+
* The size of the window in which to expect data that is out of time order.
127+
* Defaults to no latency.
128+
* If you specify a non-zero value, it must be greater than or equal to one second.
129+
* @server_default 0
130+
*/
87131
latency?: Duration
132+
/**
133+
* This functionality is reserved for internal use.
134+
* It is not supported for use in customer environments and is not subject to the support SLA of official GA features.
135+
* If set to `true`, the analysis will automatically find correlations between metrics for a given by field value and report anomalies when those correlations cease to hold.
136+
*/
88137
multivariate_by_fields?: boolean
138+
/**
139+
* Settings related to how categorization interacts with partition fields.
140+
*/
89141
per_partition_categorization?: PerPartitionCategorization
142+
/**
143+
* If this property is specified, the data that is fed to the job is expected to be pre-summarized.
144+
* This property value is the name of the field that contains the count of raw data points that have been summarized.
145+
* The same `summary_count_field_name` applies to all detectors in the job.
146+
*/
90147
summary_count_field_name?: Field
91148
}
92149

specification/ml/_types/Datafeed.ts

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,36 +138,95 @@ export enum DatafeedState {
138138
}
139139

140140
export class DatafeedStats {
141+
/**
142+
* For started datafeeds only, contains messages relating to the selection of a node.
143+
*/
141144
assignment_explanation?: string
145+
/**
146+
* A character string that uniquely identifies the datafeed.
147+
* This identifier can contain lowercase alphanumeric characters (a-z and 0-9), hyphens, and underscores.
148+
* It must start and end with alphanumeric characters.
149+
*/
142150
datafeed_id: Id
143151
/**
152+
* For started datafeeds only, this information pertains to the node upon which the datafeed is started.
144153
* @availability stack
145154
*/
146155
node?: DiscoveryNode
156+
/**
157+
* The status of the datafeed, which can be one of the following values: `starting`, `started`, `stopping`, `stopped`.
158+
*/
147159
state: DatafeedState
160+
/**
161+
* An object that provides statistical information about the timing aspects of this datafeed.
162+
*/
148163
timing_stats: DatafeedTimingStats
164+
/**
165+
* An object containing the running state for this datafeed.
166+
* It is only provided if the datafeed is started.
167+
*/
149168
running_state?: DatafeedRunningState
150169
}
151170

152171
export class DatafeedTimingStats {
172+
/**
173+
* The number of buckets processed.
174+
*/
153175
bucket_count: long
176+
/**
177+
* The exponential average search time per hour, in milliseconds.
178+
*/
154179
exponential_average_search_time_per_hour_ms: DurationValue<UnitFloatMillis>
180+
/**
181+
* Identifier for the anomaly detection job.
182+
*/
155183
job_id: Id
184+
/**
185+
* The number of searches run by the datafeed.
186+
*/
156187
search_count: long
188+
/**
189+
* The total time the datafeed spent searching, in milliseconds.
190+
*/
157191
total_search_time_ms: DurationValue<UnitFloatMillis>
192+
/**
193+
* The average search time per bucket, in milliseconds.
194+
*/
158195
average_search_time_per_bucket_ms?: DurationValue<UnitFloatMillis>
159196
}
160197

161198
export class DatafeedRunningState {
199+
/**
200+
* Indicates if the datafeed is "real-time", meaning that the datafeed has no configured `end` time.
201+
*/
162202
real_time_configured: boolean
203+
/**
204+
* Indicates whether the datafeed has finished running on the available past data.
205+
* For datafeeds without a configured `end` time, this means that the datafeed is now running on "real-time" data.
206+
*/
163207
real_time_running: boolean
208+
/**
209+
* Provides the latest time interval the datafeed has searched.
210+
*/
164211
search_interval?: RunningStateSearchInterval
165212
}
166213

167214
export class RunningStateSearchInterval {
215+
/**
216+
* The end time.
217+
*/
168218
end?: Duration
219+
/**
220+
* The end time as an epoch in milliseconds.
221+
*/
169222
end_ms: DurationValue<UnitMillis>
223+
/**
224+
* The start time.
225+
*/
170226
start?: Duration
227+
/**
228+
* The start time as an epoch in milliseconds.
229+
*/
171230
start_ms: DurationValue<UnitMillis>
172231
}
173232

specification/ml/_types/DataframeAnalytics.ts

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,43 +381,182 @@ export class DataframeAnalyticsStatsContainer {
381381
}
382382

383383
export class DataframeAnalyticsStatsHyperparameters {
384+
/**
385+
* An object containing the parameters of the classification analysis job.
386+
*/
384387
hyperparameters: Hyperparameters
385388
/** The number of iterations on the analysis. */
386389
iteration: integer
390+
/**
391+
* The timestamp, in milliseconds since the epoch, when the statistics were reported.
392+
*/
387393
timestamp: EpochTime<UnitMillis>
394+
/**
395+
* An object containing time statistics about the data frame analytics job.
396+
*/
388397
timing_stats: TimingStats
398+
/**
399+
* An object containing information about validation loss.
400+
*/
389401
validation_loss: ValidationLoss
390402
}
391403

392404
export class DataframeAnalyticsStatsOutlierDetection {
405+
/**
406+
* The list of job parameters specified by the user or determined by algorithmic heuristics.
407+
*/
393408
parameters: OutlierDetectionParameters
409+
/**
410+
* The timestamp, in milliseconds since the epoch, when the statistics were reported.
411+
*/
394412
timestamp: EpochTime<UnitMillis>
413+
/**
414+
* An object containing time statistics about the data frame analytics job.
415+
*/
395416
timing_stats: TimingStats
396417
}
397418

398419
export class Hyperparameters {
420+
/**
421+
* Advanced configuration option.
422+
* Machine learning uses loss guided tree growing, which means that the decision trees grow where the regularized loss decreases most quickly.
423+
* This parameter affects loss calculations by acting as a multiplier of the tree depth.
424+
* Higher alpha values result in shallower trees and faster training times.
425+
* By default, this value is calculated during hyperparameter optimization.
426+
* It must be greater than or equal to zero.
427+
*/
399428
alpha?: double
429+
/**
430+
* Advanced configuration option.
431+
* Regularization parameter to prevent overfitting on the training data set.
432+
* Multiplies an L2 regularization term which applies to leaf weights of the individual trees in the forest.
433+
* A high lambda value causes training to favor small leaf weights.
434+
* This behavior makes the prediction function smoother at the expense of potentially not being able to capture relevant relationships between the features and the dependent variable.
435+
* A small lambda value results in large individual trees and slower training.
436+
* By default, this value is calculated during hyperparameter optimization.
437+
* It must be a nonnegative value.
438+
*/
400439
lambda?: double
440+
/**
441+
* Advanced configuration option.
442+
* Regularization parameter to prevent overfitting on the training data set.
443+
* Multiplies a linear penalty associated with the size of individual trees in the forest.
444+
* A high gamma value causes training to prefer small trees.
445+
* A small gamma value results in larger individual trees and slower training.
446+
* By default, this value is calculated during hyperparameter optimization.
447+
* It must be a nonnegative value.
448+
*/
401449
gamma?: double
450+
/**
451+
* Advanced configuration option.
452+
* The shrinkage applied to the weights.
453+
* Smaller values result in larger forests which have a better generalization error.
454+
* However, larger forests cause slower training.
455+
* By default, this value is calculated during hyperparameter optimization.
456+
* It must be a value between `0.001` and `1`.
457+
*/
402458
eta?: double
459+
/**
460+
* Advanced configuration option.
461+
* Specifies the rate at which `eta` increases for each new tree that is added to the forest.
462+
* For example, a rate of 1.05 increases `eta` by 5% for each extra tree.
463+
* By default, this value is calculated during hyperparameter optimization.
464+
* It must be between `0.5` and `2`.
465+
*/
403466
eta_growth_rate_per_tree?: double
467+
/**
468+
* Advanced configuration option.
469+
* Defines the fraction of features that will be used when selecting a random bag for each candidate split.
470+
* By default, this value is calculated during hyperparameter optimization.
471+
*/
404472
feature_bag_fraction?: double
473+
/**
474+
* Advanced configuration option.
475+
* Controls the fraction of data that is used to compute the derivatives of the loss function for tree training.
476+
* A small value results in the use of a small fraction of the data.
477+
* If this value is set to be less than 1, accuracy typically improves.
478+
* However, too small a value may result in poor convergence for the ensemble and so require more trees.
479+
* By default, this value is calculated during hyperparameter optimization.
480+
* It must be greater than zero and less than or equal to 1.
481+
*/
405482
downsample_factor?: double
483+
/**
484+
* If the algorithm fails to determine a non-trivial tree (more than a single leaf), this parameter determines how many such consecutive failures are tolerated.
485+
* Once the number of attempts exceeds the threshold, the forest training stops.
486+
*/
406487
max_attempts_to_add_tree?: integer
488+
/**
489+
* Advanced configuration option.
490+
* A multiplier responsible for determining the maximum number of hyperparameter optimization steps in the Bayesian optimization procedure.
491+
* The maximum number of steps is determined based on the number of undefined hyperparameters times the maximum optimization rounds per hyperparameter.
492+
* By default, this value is calculated during hyperparameter optimization.
493+
*/
407494
max_optimization_rounds_per_hyperparameter?: integer
495+
/**
496+
* Advanced configuration option.
497+
* Defines the maximum number of decision trees in the forest.
498+
* The maximum value is 2000.
499+
* By default, this value is calculated during hyperparameter optimization.
500+
*/
408501
max_trees?: integer
502+
/**
503+
* The maximum number of folds for the cross-validation procedure.
504+
*/
409505
num_folds?: integer
506+
/**
507+
* Determines the maximum number of splits for every feature that can occur in a decision tree when the tree is trained.
508+
*/
410509
num_splits_per_feature?: integer
510+
/**
511+
* Advanced configuration option.
512+
* Machine learning uses loss guided tree growing, which means that the decision trees grow where the regularized loss decreases most quickly.
513+
* This soft limit combines with the `soft_tree_depth_tolerance` to penalize trees that exceed the specified depth; the regularized loss increases quickly beyond this depth.
514+
* By default, this value is calculated during hyperparameter optimization.
515+
* It must be greater than or equal to 0.
516+
*/
411517
soft_tree_depth_limit?: integer
518+
/**
519+
* Advanced configuration option.
520+
* This option controls how quickly the regularized loss increases when the tree depth exceeds `soft_tree_depth_limit`.
521+
* By default, this value is calculated during hyperparameter optimization.
522+
* It must be greater than or equal to 0.01.
523+
*/
412524
soft_tree_depth_tolerance?: double
413525
}
414526

415527
export class OutlierDetectionParameters {
528+
/**
529+
* Specifies whether the feature influence calculation is enabled.
530+
* @server_default true
531+
*/
416532
compute_feature_influence?: boolean
533+
/**
534+
* The minimum outlier score that a document needs to have in order to calculate its feature influence score.
535+
* Value range: 0-1.
536+
* @server_default 0.1
537+
*/
417538
feature_influence_threshold?: double
539+
/**
540+
* The method that outlier detection uses.
541+
* Available methods are `lof`, `ldof`, `distance_kth_nn`, `distance_knn`, and `ensemble`.
542+
* The default value is `ensemble`, which means that outlier detection uses an ensemble of different methods and normalizes and combines their individual outlier scores to obtain the overall outlier score.
543+
*/
418544
method?: string
545+
/**
546+
* Defines the value for how many nearest neighbors each method of outlier detection uses to calculate its outlier score.
547+
* When the value is not set, different values are used for different ensemble members.
548+
* This default behavior helps improve the diversity in the ensemble; only override it if you are confident that the value you choose is appropriate for the data set.
549+
*/
419550
n_neighbors?: integer
551+
/**
552+
* The proportion of the data set that is assumed to be outlying prior to outlier detection.
553+
* For example, 0.05 means it is assumed that 5% of values are real outliers and 95% are inliers.
554+
*/
420555
outlier_fraction?: double
556+
/**
557+
* If `true`, the following operation is performed on the columns before computing outlier scores: (x_i - mean(x_i)) / sd(x_i).
558+
* @server_default true
559+
*/
421560
standardization_enabled?: boolean
422561
}
423562

0 commit comments

Comments
 (0)