Exclude columns that are not one-dimensional (#19)

KonstantinSchubert · ibab · commit 28f2097ef22a · 2016-05-07T20:45:32.000+01:00
Exclude non-scalar columns and print a warning
diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py
@@ -14,6 +14,7 @@
 from math import ceil
 import re
 import ROOT
+import warnings
 
 from .utils import stretch
 
@@ -169,14 +170,27 @@ def genchunks():
     return convert_to_dataframe(arr)
 
 
+
 def convert_to_dataframe(array):
-    indices = list(filter(lambda x: x.startswith('__index__'), array.dtype.names))
+
+    def get_nonscalar_columns(array):  # returns the names of fields whose values are not 0-d
+        first_row = array[0]  # only the first record is inspected; NOTE(review): presumably fails on an empty array - confirm
+        bad_cols = np.array([x.ndim != 0 for x in first_row])  # boolean mask, True where the field value is non-scalar
+        col_names = np.array(array.dtype.names)
+        bad_names = col_names[bad_cols]  # select the offending field names with the mask
+        if not bad_names.size == 0:  # warn only when at least one column will be ignored
+            warnings.warn("Ignored the following non-scalar branches: {bad_names}"
+                          .format(bad_names=", ".join(bad_names)), UserWarning)
+        return list(bad_names)
+
+    nonscalar_columns = get_nonscalar_columns(array)
+    indices = list(filter(lambda x: x.startswith('__index__') and x not in nonscalar_columns, array.dtype.names))
     if len(indices) == 0:
-        df = DataFrame.from_records(array)
+        df = DataFrame.from_records(array, exclude=nonscalar_columns)
     elif len(indices) == 1:
         # We store the index under the __index__* branch, where
         # * is the name of the index
-        df = DataFrame.from_records(array, index=indices[0])
+        df = DataFrame.from_records(array, index=indices[0], exclude=nonscalar_columns)
         index_name = indices[0][len('__index__'):]
         if not index_name:
             # None means the index has no name
diff --git a/tests/test.py b/tests/test.py
@@ -1,6 +1,7 @@
 import pandas as pd
 from root_pandas import read_root
 from root_numpy import list_branches
+from root_numpy import array2root
 from pandas.util.testing import assert_frame_equal
 import numpy as np
 import ROOT
@@ -137,3 +138,33 @@ def test_flatten():
 
     os.remove('tmp.root')
 
+def test_drop_nonscalar_columns():
+    array = np.array([1, 2, 3])
+    matrix = np.array([[1, 2, 3], [4, 5, 6]])
+    bool_matrix = np.array([[True, False, True], [True, True, True]])
+
+    dt = np.dtype([
+        ('a', 'i4'),
+        ('b', 'int64', array.shape),
+        ('c', 'int64', matrix.shape),
+        ('d', 'bool_'),
+        ('e', 'bool_', matrix.shape)
+        ])
+    arr = np.array([
+        (3, array, matrix, True, bool_matrix),
+        (2, array, matrix, False, bool_matrix)],
+        dtype=dt)
+
+    path = 'tmp.root'
+    array2root(arr, path, 'ntuple', mode='recreate')
+
+    df = read_root(path, flatten=False)
+    # flatten=False is required here: with flatten=True the call above raises,
+    # because non-scalar columns are dropped only after flattening is applied and
+    # the flattening algorithm cannot handle arrays of more than one dimension.
+    assert(len(df.columns) == 2)  # only the scalar fields 'a' and 'd' are expected to remain
+    assert(np.all(df.index.values == np.array([0, 1])))  # default integer index for the two rows
+    assert(np.all(df.a.values == np.array([3, 2])))
+    assert(np.all(df.d.values == np.array([True, False])))
+
+    os.remove(path)