Skip to content
This repository was archived by the owner on Jan 9, 2023. It is now read-only.

Commit 45ab34d

Browse files
committed
Move implementation to its own file
1 parent 3b41ae9 commit 45ab34d

File tree

2 files changed

+225
-224
lines changed

2 files changed

+225
-224
lines changed

root_pandas/__init__.py

Lines changed: 1 addition & 224 deletions
Original file line numberDiff line numberDiff line change
@@ -1,224 +1 @@
1-
2-
"""
3-
A module that extends pandas to support the ROOT data format.
4-
"""
5-
6-
import numpy as np
7-
from numpy.lib.recfunctions import append_fields
8-
from pandas import DataFrame
9-
from root_numpy import root2array, list_trees
10-
from fnmatch import fnmatch
11-
from root_numpy import list_branches
12-
from root_numpy.extern.six import string_types
13-
import itertools
14-
from math import ceil
15-
import re
16-
import ROOT
17-
18-
from .utils import stretch
19-
20-
21-
__all__ = ['read_root']
22-
23-
24-
def expand_braces(orig):
    """
    Expand shell-style brace alternatives in a pattern.

    ``'A{B,C}*'`` becomes ``['AB*', 'AC*']``. Several brace groups in one
    pattern are expanded combinatorially. A brace group without a comma
    (e.g. ``'{ab}'``) is not an alternation and is kept literally.

    Parameters
    ----------
    orig: str
        The pattern to expand.

    Returns
    -------
    list of str
        The expanded patterns without duplicates, in order of first
        appearance (deterministic, unlike a ``set`` round-trip).
    """
    # The greedy '.*' prefix makes the captured group the right-most brace
    # group that can be matched; its closing '}' must not be preceded by a
    # backslash.
    pattern = re.compile(r'.*(\{.+?[^\\]\})')

    match = pattern.search(orig)
    if match is None:
        # Nothing left to expand; undo the temporary escaping applied below.
        return [orig.replace('\\}', '}')]

    group = match.group(1)
    # Use the match offsets directly rather than searching for the group
    # text again, so the group that was matched is the one replaced.
    head = orig[:match.start(1)]
    tail = orig[match.end(1):]

    if ',' in group:
        candidates = [head + alt + tail
                      for alt in group.strip('{}').split(',')]
    else:
        # Not an alternation: escape the closing brace so that this group
        # is skipped by the pattern on the next recursion step.
        candidates = [head + group.replace('}', '\\}') + tail]

    expanded = []
    seen = set()
    for candidate in candidates:
        for item in expand_braces(candidate):
            if item not in seen:
                seen.add(item)
                expanded.append(item)
    return expanded
47-
48-
49-
def get_matching_variables(branches, patterns, fail=True):
    """
    Select the branches that match at least one shell-style pattern.

    Parameters
    ----------
    branches: sequence of str
        The available branch names.
    patterns: sequence of str
        Patterns understood by ``fnmatch``.
    fail: bool
        If True, a pattern that matches no branch raises an error.

    Returns
    -------
    list of str
        Matching branches without duplicates, ordered by pattern first and
        by position in ``branches`` second.

    Raises
    ------
    ValueError
        If ``fail`` is True and some pattern matches no branch.
    """
    matched = []
    for pattern in patterns:
        hits = [branch for branch in branches if fnmatch(branch, pattern)]
        if fail and not hits:
            raise ValueError("Pattern '{}' didn't match any branch".format(pattern))
        for branch in hits:
            if branch not in matched:
                matched.append(branch)
    return matched
62-
63-
64-
def read_root(path, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file into a pandas DataFrame.
    Further *args and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    path: string
        The path to the root file.
    key: string
        The key of the tree to load. May be omitted if the file contains exactly one tree.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument).
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: bool
        If set to True, `stretch` is used to flatten arrays in the root file into individual entries.
        All arrays specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree,
        or a generator of DataFrames if `chunksize` is given.

    Raises
    ------
    ValueError
        If no key is given and the file does not contain exactly one tree,
        if a pattern in `columns` matches no branch,
        or if `ignore` matches an __index__* branch.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not key:
        trees = list_trees(path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(path))
        else:
            raise ValueError('More than one tree found in {}'.format(path))

    branches = list_branches(path, key)

    if not columns:
        # Copy so that filtering below never touches the list we were given.
        all_vars = list(branches)
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        else:
            # Copy (and accept tuples) so the caller's sequence is not modified.
            columns = list(columns)
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        # A real list is needed here: on Python 3 `filter` returns an
        # iterator, which is always truthy and cannot be indexed.
        index_branches = [b for b in branches if b.startswith('__index__')]
        if index_branches:
            columns.append(index_branches[0])
        columns = list(itertools.chain.from_iterable(expand_braces(c) for c in columns))
        all_vars = get_matching_variables(branches, columns)

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        # Braces must be expanded in the patterns before matching; fnmatch
        # itself does not understand {a,b}.
        ignore = list(itertools.chain.from_iterable(expand_braces(p) for p in ignore))
        ignored = get_matching_variables(branches, ignore, fail=False)
        if any(var.startswith('__index__') for var in ignored):
            raise ValueError('__index__* branch is being ignored!')
        # Filter instead of list.remove(): an ignored branch need not be
        # part of the selection made through `columns`.
        all_vars = [var for var in all_vars if var not in ignored]

    def do_flatten(arr):
        # Flatten array-valued entries and record each element's position
        # within its former array in the __array_index field.
        arr_, idx = stretch(arr, return_indices=True)
        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
        return arr

    if chunksize:
        # The number of entries is needed up front to know how many chunks to read.
        f = ROOT.TFile.Open(path)
        n_entries = f.Get(key).GetEntries()
        f.Close()

        def genchunks():
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(path, key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *args, **kwargs)
                if flatten:
                    arr = do_flatten(arr)
                yield convert_to_dataframe(arr)

        return genchunks()

    arr = root2array(path, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr)
    return convert_to_dataframe(arr)
159-
160-
161-
def convert_to_dataframe(array):
    """
    Build a DataFrame from a structured array.

    A field whose name starts with ``__index__`` becomes the index of the
    DataFrame; the remainder of the field name is used as the index name
    (an empty remainder means the index is unnamed).

    Raises
    ------
    ValueError
        If more than one field name starts with ``__index__``.
    """
    prefix = '__index__'
    index_fields = [name for name in array.dtype.names if name.startswith(prefix)]

    if len(index_fields) > 1:
        raise ValueError("More than one index found in file")

    if not index_fields:
        return DataFrame.from_records(array)

    index_field = index_fields[0]
    frame = DataFrame.from_records(array, index=index_field)
    # Whatever follows the prefix is the index name; None marks an unnamed index.
    frame.index.name = index_field[len(prefix):] or None
    return frame
177-
178-
179-
def to_root(df, path, key='default', mode='w', *args, **kwargs):
    """
    Write DataFrame to a ROOT file.

    Parameters
    ----------
    df: DataFrame
        The DataFrame to save. It is not modified.
    path: string
        File path of the ROOT file to write to
    key: string
        Name of tree that the DataFrame will be saved as
    mode: string, {'w', 'a'}
        Mode that the file should be opened in (default: 'w').
        'w' is passed on as 'recreate', 'a' as 'update'.

    Raises
    ------
    ValueError
        If `mode` is neither 'a' nor 'w'.

    Notes
    -----

    Further *args and **kwargs are passed to root_numpy's array2root.

    >>> df = DataFrame({'x': [1,2,3], 'y': [4,5,6]})
    >>> df.to_root('test.root')

    The DataFrame index will be saved as a branch called '__index__*',
    where * is the name of the index in the original DataFrame
    """
    # Translate the short mode into the name handed to array2root.
    if mode == 'w':
        root_mode = 'recreate'
    elif mode == 'a':
        root_mode = 'update'
    else:
        raise ValueError('Unknown mode: {}. Must be "a" or "w".'.format(mode))

    from root_numpy import array2root

    # Work on a shallow copy: the extra index column must not show up in
    # the caller's DataFrame.
    shallow = df.copy(deep=False)
    # An unnamed index is stored under the bare '__index__' branch.
    index_name = '' if shallow.index.name is None else shallow.index.name
    shallow['__index__' + index_name] = shallow.index
    records = shallow.to_records(index=False)
    array2root(records, path, key, mode=root_mode, *args, **kwargs)
221-
222-
223-
# Patch pandas DataFrame to support to_root method
224-
DataFrame.to_root = to_root
1+
from .readwrite import read_root

0 commit comments

Comments
 (0)