Skip to content

Commit ebc2303

Browse files
authored
Documenting that we do not support dots in field names (#1900)
Es-hadoop does not support fields with dots in their names (#853). Adding support is likely to cause more problems than it fixes. So this commit documents that we do not support them, and adds a better error message.
1 parent 77911f4 commit ebc2303

File tree

3 files changed

+29
-3
lines changed

3 files changed

+29
-3
lines changed

docs/src/reference/asciidoc/core/mapping.adoc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,10 @@ Explicit or manual mapping should be considered when the defaults need to be ove
144144
Refer to {es} {ref}/indices-create-index.html[create index] and {ref}/indices-put-mapping.html[mapping] documentation on how to define an index and its types - note that these need to be present *before* data is being uploaded to {es} (otherwise automatic mapping will be used by {es}, if enabled).
145145

146146
TIP: In most cases, {ref}/indices-templates.html[templates] are quite handy as they are automatically applied to new indices created that match the pattern; in other words instead of defining the mapping per index, one can just define the template once and then have it applied to all indices that match its pattern.
147+
148+
[float]
149+
[[limitations]]
150+
=== Limitations
151+
152+
{es} allows field names to contain dots ('.'). But {esh} does not support them, and fails when reading or writing fields with dots. Refer to
153+
{es} {ref}/dot-expand-processor.html[Dot Expander Processor] for tooling to assist in replacing dots in field names.

spark/sql-30/src/itest/scala/org/elasticsearch/spark/integration/AbstractScalaEsSparkSQL.scala

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import java.{util => ju}
2929
import java.util.concurrent.TimeUnit
3030
import org.elasticsearch.spark.integration.ScalaUtils.propertiesAsScalaMap
3131
import org.elasticsearch.spark.rdd.JDKCollectionConvertersCompat.Converters._
32+
3233
import scala.collection.Map
3334
import scala.collection.mutable.ArrayBuffer
3435
import org.apache.spark.SparkConf
@@ -92,7 +93,7 @@ import org.apache.spark.sql.SparkSession
9293
import org.elasticsearch.hadoop.EsAssume
9394
import org.elasticsearch.hadoop.TestData
9495
import org.elasticsearch.hadoop.cfg.ConfigurationOptions
95-
import org.elasticsearch.hadoop.rest.RestUtils
96+
import org.elasticsearch.hadoop.rest.{EsHadoopParsingException, RestUtils}
9697
import org.elasticsearch.hadoop.serialization.JsonUtils
9798
import org.elasticsearch.hadoop.util.EsMajorVersion
9899
import org.junit.Assert._
@@ -2541,6 +2542,19 @@ class AbstractScalaEsScalaSparkSQL(prefix: String, readMetadata: jl.Boolean, pus
25412542
assertThat(head.getString(0), containsString("Chipotle"))
25422543
}
25432544

2545+
/**
2546+
* Dots in field names are supported by Elasticsearch, but not by es-hadoop; reading or writing such fields is expected to fail.
2547+
*/
2548+
@Test(expected = classOf[SparkException])
2549+
def testDotsInFieldNames(): Unit = {
2550+
val index = wrapIndex("dots-in-names-index")
2551+
val typed = "data"
2552+
val (target, docPath) = makeTargets(index, typed)
2553+
RestUtils.postData(docPath, "{\"b\":0,\"e\":{\"f.g\":\"hello\"}}".getBytes("UTF-8"))
2554+
val df = sqc.read.format("es").load(index)
2555+
df.count()
2556+
}
2557+
25442558
/**
25452559
* Take advantage of the fixed method order and clear out all created indices.
25462560
* The indices will last in Elasticsearch for all parameters of this test suite.
@@ -2587,4 +2601,4 @@ class AbstractScalaEsScalaSparkSQL(prefix: String, readMetadata: jl.Boolean, pus
25872601
val lines = Files.readAllLines(path, StandardCharsets.ISO_8859_1).asScala.toSeq
25882602
sc.parallelize(lines)
25892603
}
2590-
}
2604+
}

spark/sql-30/src/main/scala/org/elasticsearch/spark/sql/RowValueReader.scala

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,12 @@ private[sql] trait RowValueReader extends SettingsAware {
5757
if (pos < 0 || pos >= esRow.values.size) {
5858
// geo types allow fields which are ignored - need to skip these if they are not part of the schema
5959
if (pos >= 0 || !currentFieldIsGeo) {
60-
throw new EsHadoopIllegalStateException(s"Position for '$sparkRowField' not found in row; typically this is caused by a mapping inconsistency")
60+
if (key.toString().contains(".")) {
61+
throw new EsHadoopIllegalStateException(
62+
s"Found field '$sparkRowField'. Fields containing dots ('.') are not supported in es-hadoop")
63+
} else {
64+
throw new EsHadoopIllegalStateException(s"Position for '$sparkRowField' not found in row; typically this is caused by a mapping inconsistency")
65+
}
6166
}
6267
} else {
6368
esRow.values.update(pos, value)

0 commit comments

Comments
 (0)