|
1 |
| - |
2 |
| -""" |
3 |
| -A module that extends pandas to support the ROOT data format. |
4 |
| -""" |
5 |
| - |
6 |
| -import numpy as np |
7 |
| -from numpy.lib.recfunctions import append_fields |
8 |
| -from pandas import DataFrame |
9 |
| -from root_numpy import root2array, list_trees |
10 |
| -from fnmatch import fnmatch |
11 |
| -from root_numpy import list_branches |
12 |
| -from root_numpy.extern.six import string_types |
13 |
| -import itertools |
14 |
| -from math import ceil |
15 |
| -import re |
16 |
| -import ROOT |
17 |
| - |
18 |
| -from .utils import stretch |
19 |
| - |
20 |
| - |
21 |
| -__all__ = ['read_root'] |
22 |
| - |
23 |
| - |
24 |
def expand_braces(orig):
    """
    Expand shell-style brace alternatives in a pattern.

    A pattern such as ``'A{B,C}*'`` is turned into the list of patterns
    ``['AB*', 'AC*']``. Brace groups are processed one at a time (the
    right-most group first, because of the greedy ``.*`` prefix in the
    regular expression) and the function recurses until no group is left.

    A brace group that contains no comma is not an alternation; its closing
    brace is temporarily escaped as ``\\}`` so it is not matched again, and
    the escape is removed before the pattern is returned.

    Parameters
    ----------
    orig: string
        The pattern to expand.

    Returns
    -------
    A list of the distinct expanded patterns. Duplicates are removed via a
    set, so the order of the result is not guaranteed.
    """
    group_re = re.compile(r'.*(\{.+?[^\\]\})')
    text = orig[:]

    match = group_re.search(text)
    if match is None:
        # Nothing left to expand: undo the temporary escaping of '}'.
        return list(set([text.replace('\\}', '}')]))

    group = match.group(1)
    start = text.find(group)
    head = text[:start]
    tail = text[start + len(group):]

    if ',' in group:
        # Real alternation: substitute each alternative in turn.
        replacements = group.strip('{}').split(',')
    else:
        # Literal braces: escape the closing brace so that this group is
        # skipped on the next pass.
        replacements = [group.replace('}', '\\}')]

    expanded = list()
    for replacement in replacements:
        expanded.extend(expand_braces(head + replacement + tail))

    return list(set(expanded))
47 |
| - |
48 |
| - |
49 |
def get_matching_variables(branches, patterns, fail=True):
    """
    Select the branches that match at least one shell-style pattern.

    Parameters
    ----------
    branches: sequence of str
        The available branch names.
    patterns: sequence of str
        Patterns understood by ``fnmatch`` (``*``, ``?``, ``[]``).
    fail: bool
        If True (the default), a pattern that matches no branch raises a
        ValueError. If False, such patterns are silently skipped.

    Returns
    -------
    A list of matching branch names without duplicates, ordered by pattern
    first and by position in `branches` second.
    """
    selected = []

    for pattern in patterns:
        hits = [branch for branch in branches if fnmatch(branch, pattern)]
        if fail and not hits:
            raise ValueError("Pattern '{}' didn't match any branch".format(pattern))
        for branch in hits:
            # A branch matched by an earlier pattern keeps its first position.
            if branch not in selected:
                selected.append(branch)

    return selected
62 |
| - |
63 |
| - |
64 |
def read_root(path, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file into a pandas DataFrame.
    Further *args and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    path: string
        The path to the root file.
    key: string
        The key of the tree to load. May be omitted if the file contains exactly one tree.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument).
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: bool
        If set to True, will use root_numpy.stretch to flatten arrays in the root file into individual entries.
        All arrays specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree, or a
        generator of such DataFrames if `chunksize` is given.

    Raises
    ------
    ValueError
        If `key` is not given and the file contains no tree or more than one
        tree, if a pattern in `columns` matches no branch, or if `ignore`
        matches an __index__* branch.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not key:
        trees = list_trees(path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(path))
        else:
            raise ValueError('More than one tree found in {}'.format(path))

    branches = list_branches(path, key)

    if not columns:
        all_vars = list(branches)
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        # Work on a copy: the caller's sequence must not be modified, and a
        # tuple of patterns has to be accepted as well.
        columns = list(columns)
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        # A list (not `filter`) is required here: on Python 3 `filter` returns
        # an iterator that is always truthy and cannot be indexed.
        index_branches = [b for b in branches if b.startswith('__index__')]
        if index_branches:
            columns.append(index_branches[0])
        columns = list(itertools.chain.from_iterable(expand_braces(c) for c in columns))
        all_vars = get_matching_variables(branches, columns)

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        # Braces have to be expanded in the patterns *before* matching, since
        # fnmatch does not understand {} itself.
        ignore = list(itertools.chain.from_iterable(expand_braces(p) for p in ignore))
        ignored = set(get_matching_variables(branches, ignore, fail=False))
        if any(var.startswith('__index__') for var in ignored):
            raise ValueError('__index__* branch is being ignored!')
        # Filter instead of list.remove(): an ignored branch is not
        # necessarily among the selected columns.
        all_vars = [var for var in all_vars if var not in ignored]

    def do_flatten(arr):
        # Flatten array-valued entries and remember each element's position
        # within its former array in the __array_index column.
        arr_, idx = stretch(arr, return_indices=True)
        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
        return arr

    if chunksize:
        # The total number of entries is needed to know how many chunks exist.
        f = ROOT.TFile.Open(path)
        try:
            n_entries = f.Get(key).GetEntries()
        finally:
            f.Close()

        def genchunks():
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(path, key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *args, **kwargs)
                if flatten:
                    arr = do_flatten(arr)
                yield convert_to_dataframe(arr)

        return genchunks()

    arr = root2array(path, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr)
    return convert_to_dataframe(arr)
159 |
| - |
160 |
| - |
161 |
def convert_to_dataframe(array):
    """
    Build a DataFrame from a structured array.

    A field whose name starts with ``__index__`` is used as the DataFrame's
    index. The remainder of the field name becomes the index name; if the
    remainder is empty, the index is left unnamed.

    Parameters
    ----------
    array: structured numpy array
        The records to convert.

    Returns
    -------
    The resulting DataFrame.

    Raises
    ------
    ValueError
        If more than one field starts with ``__index__``.
    """
    index_fields = [name for name in array.dtype.names if name.startswith('__index__')]

    if len(index_fields) > 1:
        raise ValueError("More than one index found in file")

    if not index_fields:
        return DataFrame.from_records(array)

    # The index is stored under the __index__* field, where * is the name
    # of the index.
    field = index_fields[0]
    frame = DataFrame.from_records(array, index=field)
    # An empty suffix means the index has no name, which pandas spells None.
    frame.index.name = field[len('__index__'):] or None
    return frame
177 |
| - |
178 |
| - |
179 |
def to_root(df, path, key='default', mode='w', *args, **kwargs):
    """
    Write DataFrame to a ROOT file.

    Parameters
    ----------
    path: string
        File path to new ROOT file (will be overwritten)
    key: string
        Name of tree that the DataFrame will be saved as
    mode: string, {'w', 'a'}
        Mode that the file should be opened in (default: 'w')

    Raises
    ------
    ValueError
        If `mode` is neither 'a' nor 'w'.

    Notes
    -----

    Further *args and *kwargs are passed to root_numpy's array2root.

        >>> df = DataFrame({'x': [1,2,3], 'y': [4,5,6]})
        >>> df.to_root('test.root')

    The DataFrame index will be saved as a branch called '__index__*',
    where * is the name of the index in the original DataFrame
    """
    # Validate the mode first and translate it into the mode string that is
    # handed to array2root.
    if mode != 'a' and mode != 'w':
        raise ValueError('Unknown mode: {}. Must be "a" or "w".'.format(mode))
    root_mode = 'update' if mode == 'a' else 'recreate'

    from root_numpy import array2root

    # A shallow copy keeps the user's DataFrame free of the extra index column.
    frame = df.copy(deep=False)
    index_name = frame.index.name
    # An unnamed index is stored under the bare '__index__' branch.
    suffix = '' if index_name is None else index_name
    frame['__index__' + suffix] = frame.index

    records = frame.to_records(index=False)
    array2root(records, path, key, mode=root_mode, *args, **kwargs)
221 |
| - |
222 |
| - |
223 |
# Patch pandas DataFrame to support the to_root method: after this
# assignment, `df.to_root(path, ...)` is equivalent to `to_root(df, path, ...)`.
DataFrame.to_root = to_root
| 1 | +from .readwrite import read_root |
0 commit comments