Implement flattening arrays when reading

ibab · ibab · commit 3b41ae95cb4e · 2016-01-28T17:36:56.000+01:00
This implementation should be enough to fix #7 for now. We can make this more flexible later on, for example allowing the user to flatten different arrays at once, producing all combinations of entries.
diff --git a/root_pandas/__init__.py b/root_pandas/__init__.py
@@ -3,6 +3,8 @@
 A module that extends pandas to support the ROOT data format.
 """
 
+import numpy as np
+from numpy.lib.recfunctions import append_fields
 from pandas import DataFrame
 from root_numpy import root2array, list_trees
 from fnmatch import fnmatch
@@ -13,6 +15,8 @@
 import re
 import ROOT
 
+from .utils import stretch
+
 
 __all__ = ['read_root']
 
@@ -57,7 +61,7 @@ def get_matching_variables(branches, patterns, fail=True):
     return selected
 
 
-def read_root(path, key=None, columns=None, ignore=None, chunksize=None, where=None, *args, **kwargs):
+def read_root(path, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
     """
     Read a ROOT file into a pandas DataFrame.
     Further *args and *kwargs are passed to root_numpy's root2array.
@@ -66,17 +70,23 @@ def read_root(path, key=None, columns=None, ignore=None, chunksize=None, where=N
     Parameters
     ----------
     path: string
-        The path to the root file
+        The path to the root file.
     key: string
         The key of the tree to load.
     columns: str or sequence of str
         A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
     ignore: str or sequence of str
-        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument)
+        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument).
     chunksize: int
-        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows
+        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
     where: str
-        Only rows that match the expression will be read
+        Only rows that match the expression will be read.
+    flatten: bool
+        If set to True, arrays in the root file are flattened into individual entries (using a copy of root_numpy's stretch).
+        Within each entry, all arrays specified in the columns must have the same length for this to work.
+        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
+        so you will be iterating over a number of entries that is potentially larger than chunksize.
+        The index of each element within its former array will be saved in the __array_index column.
 
     Returns
     -------
@@ -89,10 +99,10 @@ def read_root(path, key=None, columns=None, ignore=None, chunksize=None, where=N
 
     """
     if not key:
-        branches = list_trees(path)
-        if len(branches) == 1:
-            key = branches[0]
-        elif len(branches) == 0:
+        trees = list_trees(path)
+        if len(trees) == 1:
+            key = trees[0]
+        elif len(trees) == 0:
             raise ValueError('No trees found in {}'.format(path))
         else:
             raise ValueError('More than one tree found in {}'.format(path))
@@ -123,18 +133,28 @@ def read_root(path, key=None, columns=None, ignore=None, chunksize=None, where=N
         for var in ignored:
             all_vars.remove(var)
 
+    def do_flatten(arr):
+        # Expand array columns to one row per element and record each element's former position
+        stretched, positions = stretch(arr, return_indices=True)
+        return append_fields(stretched, '__array_index', positions, usemask=False, asrecarray=True)
+
     if chunksize:
-        f = ROOT.TFile(path)
+        f = ROOT.TFile.Open(path)
         n_entries = f.Get(key).GetEntries()
         f.Close()
 
         def genchunks():
             for chunk in range(int(ceil(float(n_entries) / chunksize))):
                 arr = root2array(path, key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *args, **kwargs)
+                if flatten:  # flattening is applied per chunk, so a chunk may expand to more than chunksize rows
+                    arr = do_flatten(arr)
                 yield convert_to_dataframe(arr)
+
         return genchunks()
 
     arr = root2array(path, key, all_vars, selection=where, *args, **kwargs)
+    if flatten:
+        arr = do_flatten(arr)
     return convert_to_dataframe(arr)
 
 
diff --git a/root_pandas/utils.py b/root_pandas/utils.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2012 rootpy developers and contributors
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+#
+# Code temporarily copied from the root_numpy package
+#
+
+import numpy as np
+VLEN = np.vectorize(len)
+
+def stretch(arr, fields=None, return_indices=False):
+    """Stretch an array.
+    Stretch an array by ``hstack()``-ing multiple array fields while
+    preserving column names and record array structure. If a scalar field is
+    specified, it is repeated so that it lines up with the array fields.
+    Parameters
+    ----------
+    arr : NumPy structured or record array
+        The array to be stretched.
+    fields : list of strings, optional (default=None)
+        A list of column names to stretch. If None, then stretch all fields.
+    return_indices : bool, optional (default=False)
+        If True, the array index of each stretched array entry will be
+        returned in addition to the stretched array.
+        This changes the return type of this function to a tuple consisting
+        of a structured array and a NumPy integer index array.
+    Returns
+    -------
+    ret : A NumPy structured array
+        The stretched array.
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from root_pandas.utils import stretch
+    >>> arr = np.empty(2, dtype=[('scalar', int), ('array', 'O')])
+    >>> arr[0] = (0, np.array([1, 2, 3], dtype=float))
+    >>> arr[1] = (1, np.array([4, 5, 6], dtype=float))
+    >>> stretch(arr, ['scalar', 'array'])
+    array([(0, 1.0), (0, 2.0), (0, 3.0), (1, 4.0), (1, 5.0), (1, 6.0)],
+        dtype=[('scalar', '<i8'), ('array', '<f8')])
+    """
+    dtype = []
+    len_array = None  # per-row element counts, set by the first array field encountered
+
+    if fields is None:
+        fields = arr.dtype.names
+
+    # Construct dtype and check consistency
+    for field in fields:
+        dt = arr.dtype[field]
+        if dt == 'O' or len(dt.shape):
+            if dt == 'O':
+                # Variable-length array field
+                lengths = VLEN(arr[field])
+            else:
+                # Fixed-length array field
+                lengths = np.repeat(dt.shape[0], arr.shape[0])
+            if len_array is None:
+                len_array = lengths
+            elif not np.array_equal(lengths, len_array):  # every array field must match row by row
+                raise ValueError(
+                    "inconsistent lengths of array columns in input")
+            if dt == 'O':
+                dtype.append((field, arr[field][0].dtype))  # element dtype taken from the first row only (assumes arr is non-empty)
+            else:
+                dtype.append((field, arr[field].dtype, dt.shape[1:]))  # leading dimension is dropped; it is stretched into rows
+        else:
+            # Scalar field
+            dtype.append((field, dt))
+
+    if len_array is None:  # no requested field was object-typed or had a shape
+        raise RuntimeError("no array column in input")
+
+    # Build stretched output
+    ret = np.empty(np.sum(len_array), dtype=dtype)  # one output row per array element
+    for field in fields:
+        dt = arr.dtype[field]
+        if dt == 'O' or len(dt.shape) == 1:
+            # Variable-length or 1D fixed-length array field
+            ret[field] = np.hstack(arr[field])
+        elif len(dt.shape):
+            # Multidimensional fixed-length array field
+            ret[field] = np.vstack(arr[field])
+        else:
+            # Scalar field
+            ret[field] = np.repeat(arr[field], len_array)
+
+    if return_indices:
+        idx = np.concatenate(list(map(np.arange, len_array)))  # 0..n-1 for each original row of length n
+        return ret, idx
+    
+    return ret
diff --git a/tests/test.py b/tests/test.py
@@ -2,6 +2,8 @@
 from root_pandas import read_root
 from root_numpy import list_branches
 from pandas.util.testing import assert_frame_equal
+import numpy as np
+import ROOT
 import os
 
 def test_read_write():
@@ -39,3 +41,35 @@ def test_chunked_reading():
     assert count == 3
     os.remove('tmp.root')
 
+def test_flatten():
+    """Check that flatten=True expands array branches into one row per element."""
+    tf = ROOT.TFile('tmp.root', 'RECREATE')
+    tt = ROOT.TTree("a", "a")
+
+    # 'length/I' declares a 32-bit integer branch, so the buffer must be int32 as well
+    length = np.array([3], dtype='int32')
+    x = np.array([0, 1, 2], dtype='float64')
+    tt.Branch('length', length, 'length/I')
+    tt.Branch('x', x, 'x[length]/D')
+
+    tt.Fill()
+    x[:] = [3, 4, 5]  # write in place: the branch keeps pointing at this buffer
+    tt.Fill()
+
+    tf.Write()
+    tf.Close()
+
+    df_ = read_root('tmp.root', flatten=True)
+
+    assert '__array_index' in df_.columns
+    assert len(df_) == 6
+    assert np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2]))
+    assert np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5]))
+
+    # Also flatten chunked data: each chunk holds one entry of length 3
+    for df_ in read_root('tmp.root', flatten=True, chunksize=1):
+        assert len(df_) == 3
+        assert np.all(df_['__array_index'] == np.array([0, 1, 2]))
+
+    os.remove('tmp.root')
+