Use noexpand: prefix for formula branches

alexpearce · ibab · commit d161d5aebb8e · 2016-05-15T20:42:48.000+02:00
root_numpy allows branch names to be formulas, such as 2*sqrt(x), which are parsed by ROOT’s TFormula class. root_pandas’ use of * as a wildcard for branch name matching conflicted with this behaviour. This commit adds the NOEXPAND_PREFIX string (`noexpand:`): column names that start with it skip the branch name matching mechanisms, and the prefix is stripped from the column name in the returned DataFrame. Closes #14.
diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py
@@ -24,6 +24,8 @@
     'to_root',
 ]
 
+NOEXPAND_PREFIX = 'noexpand:'
+
 
 def expand_braces(orig):
     r = r'.*(\{.+?[^\\]\})'
@@ -72,6 +74,25 @@ def get_matching_variables(branches, patterns, fail=True):
     return selected
 
 
+def filter_noexpand_columns(columns):
+    """Split columns into those without and those with the noexpand prefix.
+
+    Parameters
+    ----------
+    columns: iterable of str
+      The column names to split; the iterable is traversed only once.
+
+    Returns
+    -------
+      Two lists, the first containing strings without the noexpand prefix, the
+      second containing those that had it, with the prefix stripped.
+    """
+    other, noexpand = [], []
+    for c in columns:
+        (noexpand if c.startswith(NOEXPAND_PREFIX) else other).append(c)
+    return other, [c[len(NOEXPAND_PREFIX):] for c in noexpand]
+
+
 def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
     """
     Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
@@ -86,6 +107,9 @@ def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=
         The key of the tree to load.
     columns: str or sequence of str
         A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
+        The columns beginning with `noexpand:` are not interpreted as shell-patterns,
+        allowing formula columns such as `noexpand:2*x`. The column in the returned DataFrame
+        will not have the `noexpand:` prefix.
     ignore: str or sequence of str
         A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument).
     chunksize: int
@@ -137,8 +161,9 @@ def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=
         if index_branches:
             columns = columns[:]
             columns.append(index_branches[0])
+        columns, noexpand = filter_noexpand_columns(columns)
         columns = list(itertools.chain.from_iterable(list(map(expand_braces, columns))))
-        all_vars = get_matching_variables(branches, columns)
+        all_vars = get_matching_variables(branches, columns) + noexpand
 
     if ignore:
         if isinstance(ignore, string_types):
diff --git a/tests/test.py b/tests/test.py
@@ -204,4 +204,24 @@ def test_drop_nonscalar_columns():
     assert(np.all(df.a.values == np.array([3, 2])))
     assert(np.all(df.d.values == np.array([True, False])))
 
-    os.remove(path)
+    os.remove(path)
+
+def test_noexpand_prefix():
+    """Formula columns must be readable through the `noexpand:` prefix."""
+    path = 'tmp.root'
+    xs = np.array([1, 2, 3])
+    pd.DataFrame({'x': xs}).to_root(path)
+    try:
+        # Not using the prefix should throw, as there's no matching branch name
+        try:
+            read_root(path, columns=['2*x'])
+        except ValueError:
+            pass
+        else:
+            assert False, 'expected ValueError for an unmatched column pattern'
+        # Could also use TMath::Sqrt here
+        df = read_root(path, columns=['noexpand:2*sqrt(x)'])
+        # Note that the column name shouldn't have the noexpand prefix
+        assert np.all(df['2*sqrt(x)'].values == 2*np.sqrt(xs))
+    finally:
+        os.remove(path)  # clean up even when an assertion above fails