Skip to content
This repository was archived by the owner on Jan 9, 2023. It is now read-only.

Commit d7e2956

Browse files
KonstantinSchubert authored and ibab committed
Require user to specify which columns to flatten
This fixes #20.
1 parent 28f2097 commit d7e2956

File tree

2 files changed

+74
-28
lines changed

2 files changed

+74
-28
lines changed

root_pandas/readwrite.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,13 @@ def expand_braces(orig):
5050
return list(set(res))
5151

5252

53+
def get_nonscalar_columns(array):
54+
first_row = array[0]
55+
bad_cols = np.array([x.ndim != 0 for x in first_row])
56+
col_names = np.array(array.dtype.names)
57+
bad_names = col_names[bad_cols]
58+
return list(bad_names)
59+
5360
def get_matching_variables(branches, patterns, fail=True):
5461
selected = []
5562

@@ -85,9 +92,9 @@ def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=
8592
If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
8693
where: str
8794
Only rows that match the expression will be read.
88-
flatten: bool
89-
If set to True, will use root_numpy.stretch to flatten arrays in the root file into individual entries.
90-
All arrays specified in the columns must have the same length for this to work.
95+
flatten: sequence of str
96+
A sequence of column names. Will use root_numpy.stretch to flatten arrays in the specified columns into
97+
individual entries. All arrays specified in the columns must have the same length for this to work.
9198
Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
9299
so you will be iterating over a number of entries that is potentially larger than chunksize.
93100
The index of each element within its former array will be saved in the __array_index column.
@@ -143,8 +150,19 @@ def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=
143150
for var in ignored:
144151
all_vars.remove(var)
145152

146-
def do_flatten(arr):
147-
arr_, idx = stretch(arr, return_indices=True)
153+
def do_flatten(arr, flatten):
154+
if flatten is True:
155+
warnings.warn(" The option flatten=True is deprecated. Please specify the branches you would like "
156+
"to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
157+
arr_, idx = stretch(arr, return_indices=True)
158+
else:
159+
nonscalar = get_nonscalar_columns(arr)
160+
fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
161+
will_drop = [x for x in arr.dtype.names if x not in fields]
162+
if will_drop:
163+
warnings.warn("Ignored the following non-scalar branches: {bad_names}"
164+
.format(bad_names=", ".join(will_drop)), UserWarning)
165+
arr_, idx = stretch(arr, fields=fields, return_indices=True)
148166
arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
149167
return arr
150168

@@ -159,31 +177,22 @@ def genchunks():
159177
for chunk in range(int(ceil(float(n_entries) / chunksize))):
160178
arr = root2array(paths, key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *args, **kwargs)
161179
if flatten:
162-
arr = do_flatten(arr)
180+
arr = do_flatten(arr, flatten)
163181
yield convert_to_dataframe(arr)
164-
165182
return genchunks()
166183

167184
arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
168185
if flatten:
169-
arr = do_flatten(arr)
186+
arr = do_flatten(arr, flatten)
170187
return convert_to_dataframe(arr)
171188

172189

173190

174191
def convert_to_dataframe(array):
175-
176-
def get_nonscalar_columns(array):
177-
first_row = array[0]
178-
bad_cols = np.array([x.ndim != 0 for x in first_row])
179-
col_names = np.array(array.dtype.names)
180-
bad_names = col_names[bad_cols]
181-
if not bad_names.size == 0:
182-
warnings.warn("Ignored the following non-scalar branches: {bad_names}"
183-
.format(bad_names=", ".join(bad_names)), UserWarning)
184-
return list(bad_names)
185-
186192
nonscalar_columns = get_nonscalar_columns(array)
193+
if nonscalar_columns:
194+
warnings.warn("Ignored the following non-scalar branches: {bad_names}"
195+
.format(bad_names=", ".join(nonscalar_columns)), UserWarning)
187196
indices = list(filter(lambda x: x.startswith('__index__') and x not in nonscalar_columns, array.dtype.names))
188197
if len(indices) == 0:
189198
df = DataFrame.from_records(array, exclude=nonscalar_columns)

tests/test.py

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
import ROOT
88
import os
9+
import warnings
910

1011
def test_read_write():
1112
df = pd.DataFrame({'x': [1,2,3]})
@@ -110,34 +111,69 @@ def test_flatten():
110111

111112
length = np.array([3])
112113
x = np.array([0, 1, 2], dtype='float64')
114+
y = np.array([6, 7, 8], dtype='float64')
113115
tt.Branch('length', length, 'length/I')
114116
tt.Branch('x', x, 'x[length]/D')
115-
117+
tt.Branch('y', y, 'y[length]/D')
116118
tt.Fill()
117119
x[0] = 3
118120
x[1] = 4
119121
x[2] = 5
122+
y[0] = 9
123+
y[1] = 10
124+
y[2] = 11
120125
tt.Fill()
121126

122127
tf.Write()
123128
tf.Close()
124129

125130
branches = list_branches('tmp.root')
126131

127-
df_ = read_root('tmp.root', flatten=True)
128132

133+
# flatten one out of two array branches
134+
with warnings.catch_warnings():
135+
warnings.simplefilter("ignore")
136+
df_ = read_root('tmp.root', flatten=['x'])
129137
assert('__array_index' in df_.columns)
130138
assert(len(df_) == 6)
139+
assert('length' in df_.columns.values)
140+
assert('x' in df_.columns.values)
141+
assert('y' not in df_.columns.values)
131142
assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
143+
assert(np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5])))
132144

133-
# Also flatten chunked data
134145

135-
for df_ in read_root('tmp.root', flatten=True, chunksize=1):
146+
# flatten both array branches
147+
df_ = read_root('tmp.root', flatten=['x','y'])
148+
assert('__array_index' in df_.columns)
149+
assert(len(df_) == 6)
150+
assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
151+
assert('length' in df_.columns.values)
152+
assert('x' in df_.columns.values)
153+
assert('y' in df_.columns.values)
154+
assert(np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5])))
155+
assert(np.all(df_['y'] == np.array([6, 7, 8, 9, 10, 11])))
156+
157+
158+
# Also flatten chunked data
159+
for df_ in read_root('tmp.root', flatten=['x'], chunksize=1):
136160
assert(len(df_) == 3)
137161
assert(np.all(df_['__array_index'] == np.array([0, 1, 2])))
138162

163+
# Also test deprecated behaviour
164+
with warnings.catch_warnings():
165+
warnings.simplefilter("ignore")
166+
df_ = read_root('tmp.root', flatten=True)
167+
assert('__array_index' in df_.columns)
168+
assert(len(df_) == 6)
169+
assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
170+
171+
139172
os.remove('tmp.root')
140173

174+
175+
176+
141177
def test_drop_nonscalar_columns():
142178
array = np.array([1, 2, 3])
143179
matrix = np.array([[1, 2, 3], [4, 5, 6]])
@@ -157,11 +193,12 @@ def test_drop_nonscalar_columns():
157193

158194
path = 'tmp.root'
159195
array2root(arr, path, 'ntuple', mode='recreate')
160-
161-
df = read_root(path, flatten=False)
162-
# the above line throws an error if flatten=True because nonscalar columns
163-
# are dropped only after the flattening is applied. However, the flattening
164-
# algorithm can not deal with arrays of more than one dimension.
196+
with warnings.catch_warnings():
197+
warnings.simplefilter("ignore")
198+
df = read_root(path, flatten=False)
199+
# the above line throws an error if flatten=True because nonscalar columns
200+
# are dropped only after the flattening is applied. However, the flattening
201+
# algorithm can not deal with arrays of more than one dimension.
165202
assert(len(df.columns) == 2)
166203
assert(np.all(df.index.values == np.array([0, 1])))
167204
assert(np.all(df.a.values == np.array([3, 2])))

0 commit comments

Comments
 (0)