Support opening spark data frames in the data viewer (#10304)

rchiodo · web-flow · commit 73f94f2a2fff · 2020-02-24T17:12:13.000-08:00
* Support opening spark data frames in the data viewer

* Review feedback
diff --git a/news/2 Fixes/9959.md b/news/2 Fixes/9959.md
@@ -0,0 +1 @@
+Support opening spark dataframes in the data viewer.
diff --git a/pythonFiles/datascience/getJupyterVariableDataFrameInfo.py b/pythonFiles/datascience/getJupyterVariableDataFrameInfo.py
@@ -10,6 +10,22 @@
 # Indexes off of _VSCODE_targetVariable need to index types that are part of IJupyterVariable
 _VSCODE_targetVariable = _VSCODE_json.loads("""_VSCode_JupyterTestValue""")
 
+# Compute the row count for a value; returns 0 (never None) when no count can be determined
+def _VSCODE_getRowCount(var):
+    try:
+        # Get a bit more restrictive with exactly what we want to count as a shape, since anything can define it
+        if hasattr(var, "shape") and isinstance(var.shape, tuple):
+            # A 0-d shape () has no first dimension; report 0 rows instead of raising IndexError
+            return var.shape[0] if var.shape else 0
+        # No usable shape (missing or not a tuple): fall back to len() when it is available
+        if hasattr(var, "__len__"):
+            return len(var)
+    except TypeError:
+        # Same best-effort handling as before: a TypeError while probing the value means "no rows"
+        pass
+    return 0
+
+
 # First check to see if we are a supported type, this prevents us from adding types that are not supported
 # and also keeps our types in sync with what the variable explorer says that we support
 if _VSCODE_targetVariable["type"] not in _VSCode_supportsDataExplorer:
@@ -21,18 +37,7 @@
     _VSCODE_evalResult = eval(_VSCODE_targetVariable["name"])
 
     # Figure out shape if not already there. Use the shape to compute the row count
-    if hasattr(_VSCODE_evalResult, "shape"):
-        try:
-            # Get a bit more restrictive with exactly what we want to count as a shape, since anything can define it
-            if isinstance(_VSCODE_evalResult.shape, tuple):
-                _VSCODE_targetVariable["rowCount"] = _VSCODE_evalResult.shape[0]
-        except TypeError:
-            _VSCODE_targetVariable["rowCount"] = 0
-    elif hasattr(_VSCODE_evalResult, "__len__"):
-        try:
-            _VSCODE_targetVariable["rowCount"] = len(_VSCODE_evalResult)
-        except TypeError:
-            _VSCODE_targetVariable["rowCount"] = 0
+    _VSCODE_targetVariable["rowCount"] = _VSCODE_getRowCount(_VSCODE_evalResult)
 
     # Turn the eval result into a df
     _VSCODE_df = _VSCODE_evalResult
@@ -45,6 +50,9 @@
         _VSCODE_df = _VSCODE_pd.Series.to_frame(_VSCODE_evalResult)
     elif _VSCODE_targetVariable["type"] == "ndarray":
         _VSCODE_df = _VSCODE_pd.DataFrame(_VSCODE_evalResult)
+    elif hasattr(_VSCODE_df, "toPandas"):
+        _VSCODE_df = _VSCODE_df.toPandas()
+        _VSCODE_targetVariable["rowCount"] = _VSCODE_getRowCount(_VSCODE_df)
 
     # If any rows, use pandas json to convert a single row to json. Extract
     # the column names and types from the json so we match what we'll fetch when
diff --git a/pythonFiles/datascience/getJupyterVariableDataFrameRows.py b/pythonFiles/datascience/getJupyterVariableDataFrameRows.py
@@ -24,6 +24,8 @@
     _VSCODE_df = _VSCODE_pd.Series.to_frame(_VSCODE_evalResult)
 elif _VSCODE_targetVariable["type"] == "ndarray":
     _VSCODE_df = _VSCODE_pd.DataFrame(_VSCODE_evalResult)
+elif hasattr(_VSCODE_df, "toPandas"):
+    _VSCODE_df = _VSCODE_df.toPandas()
 # If not a known type, then just let pandas handle it.
 elif not (hasattr(_VSCODE_df, "iloc")):
     _VSCODE_df = _VSCODE_pd.DataFrame(_VSCODE_evalResult)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -0,0 +1 @@` |
| | `1` | `+Support opening spark dataframes in the data viewer.` |