Add overlay for tokenizer in ML analysis_config

lcawl · lcawl · commit 5f98fec684d0 · 2024-09-17T18:07:42.000-07:00
diff --git a/docs/overlays/elasticsearch-openapi-overlays.yaml b/docs/overlays/elasticsearch-openapi-overlays.yaml
@@ -56,4 +56,25 @@ actions:
           By default, this property has the following value: `{"match_all": {"boost": 1}}`.
         externalDocs:
           url: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
-          description: Query DSL
+          description: Query DSL
+  - target: "$.components['schemas']['ml._types:CategorizationAnalyzerDefinition'].properties.tokenizer"
+    description: Remove tokenizer object from ML anomaly detection analysis config
+    remove: true
+  - target: "$.components['schemas']['ml._types:CategorizationAnalyzerDefinition'].properties"
+    description: Re-add a simplified tokenizer object in ML anomaly detection analysis config
+    update:
+      tokenizer:
+        x-abbreviated: true
+        oneOf:
+          - type: object
+          - type: string
+        description: >
+          The name or definition of the tokenizer to use after character filters are applied.
+          This property is required if `categorization_analyzer` is specified as an object.
+          Machine learning provides a tokenizer called `ml_standard`, which has been found to produce good categorization results on a variety of log file formats for logs in English.
+          If you want to use that tokenizer but change the character or token filters, specify `"tokenizer": "ml_standard"` in your `categorization_analyzer`.
+          Additionally, the `ml_classic` tokenizer is available, which tokenizes in the same way as the non-customizable tokenizer in old versions of the product (before 6.2).
+          `ml_classic` was the default categorization tokenizer in versions 6.2 to 7.13, so if you need categorization identical to the default for jobs created in these versions, specify `"tokenizer": "ml_classic"` in your `categorization_analyzer`.
+        externalDocs:
+          url: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html
+          description: Tokenizer reference
diff --git a/docs/overlays/elasticsearch-serverless-openapi-overlays.yaml b/docs/overlays/elasticsearch-serverless-openapi-overlays.yaml
@@ -30,9 +30,10 @@ actions:
       x-beta: true
   # Remove and annotate items that are not shown in Bump.sh due to depth limits
   - target: "$.components['schemas']['ml._types:Datafeed'].properties.query"
+    description: Remove query object from ML anomaly detection datafeed
     remove: true
   - target: "$.components['schemas']['ml._types:Datafeed'].properties"
-    description: Re-add a simplified query object
+    description: Re-add a simplified query object in ML anomaly detection datafeed
     update:
       query:
         x-abbreviated: true
@@ -45,4 +46,24 @@ actions:
         externalDocs:
           url: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
           description: Query DSL
-        
+  - target: "$.components['schemas']['ml._types:CategorizationAnalyzerDefinition'].properties.tokenizer"
+    description: Remove tokenizer object from ML anomaly detection analysis config
+    remove: true
+  - target: "$.components['schemas']['ml._types:CategorizationAnalyzerDefinition'].properties"
+    description: Re-add a simplified tokenizer object in ML anomaly detection analysis config
+    update:
+      tokenizer:
+        x-abbreviated: true
+        oneOf:
+          - type: object
+          - type: string
+        description: >
+          The name or definition of the tokenizer to use after character filters are applied.
+          This property is required if `categorization_analyzer` is specified as an object.
+          Machine learning provides a tokenizer called `ml_standard`, which has been found to produce good categorization results on a variety of log file formats for logs in English.
+          If you want to use that tokenizer but change the character or token filters, specify `"tokenizer": "ml_standard"` in your `categorization_analyzer`.
+          Additionally, the `ml_classic` tokenizer is available, which tokenizes in the same way as the non-customizable tokenizer in old versions of the product (before 6.2).
+          `ml_classic` was the default categorization tokenizer in versions 6.2 to 7.13, so if you need categorization identical to the default for jobs created in these versions, specify `"tokenizer": "ml_classic"` in your `categorization_analyzer`.
+        externalDocs:
+          url: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html
+          description: Tokenizer reference