@@ -51,21 +51,25 @@ def __iter__(self):
51
51
def load_everything_into_ram (pv_power_filename , pv_metadata_filename ) -> xr .DataArray :
52
52
"""Open AND load PV data into RAM."""
53
53
# Load pd.DataFrame of power and pd.Series of capacities:
54
- pv_power_watts , pv_capacity_wp , pv_system_row_number = _load_pv_power_watts_and_capacity_wp (
54
+ (
55
+ pv_power_watts ,
56
+ pv_capacity_watt_power ,
57
+ pv_system_row_number ,
58
+ ) = _load_pv_power_watts_and_capacity_watt_power (
55
59
pv_power_filename ,
56
60
)
57
61
pv_metadata = _load_pv_metadata (pv_metadata_filename )
58
- # Ensure pv_metadata, pv_power_watts, and pv_capacity_wp all have the same set of
62
+ # Ensure pv_metadata, pv_power_watts, and pv_capacity_watt_power all have the same set of
59
63
# PV system IDs, in the same order:
60
64
pv_metadata , pv_power_watts = intersection_of_pv_system_ids (pv_metadata , pv_power_watts )
61
- pv_capacity_wp = pv_capacity_wp .loc [pv_power_watts .columns ]
65
+ pv_capacity_watt_power = pv_capacity_watt_power .loc [pv_power_watts .columns ]
62
66
pv_system_row_number = pv_system_row_number .loc [pv_power_watts .columns ]
63
67
64
68
data_in_ram = put_pv_data_into_an_xr_dataarray (
65
69
pv_power_watts = pv_power_watts ,
66
70
y_osgb = pv_metadata .y_osgb .astype (np .float32 ),
67
71
x_osgb = pv_metadata .x_osgb .astype (np .float32 ),
68
- capacity_wp = pv_capacity_wp ,
72
+ capacity_watt_power = pv_capacity_watt_power ,
69
73
pv_system_row_number = pv_system_row_number ,
70
74
)
71
75
@@ -77,12 +81,12 @@ def load_everything_into_ram(pv_power_filename, pv_metadata_filename) -> xr.Data
77
81
return data_in_ram
78
82
79
83
80
- def _load_pv_power_watts_and_capacity_wp (
84
+ def _load_pv_power_watts_and_capacity_watt_power (
81
85
filename : Union [str , Path ],
82
86
start_date : Optional [datetime .datetime ] = None ,
83
87
end_date : Optional [datetime .datetime ] = None ,
84
88
) -> tuple [pd .DataFrame , pd .Series , pd .Series ]:
85
- """Return pv_power_watts, pv_capacity_wp , pv_system_row_number.
89
+ """Return pv_power_watts, pv_capacity_watt_power , pv_system_row_number.
86
90
87
91
The capacities and pv_system_row_number are computed across the *entire* dataset,
88
92
and so are independent of the `start_date` and `end_date`. This ensures the PV system
@@ -94,7 +98,7 @@ def _load_pv_power_watts_and_capacity_wp(
94
98
# Load data in a way that will work in the cloud and locally:
95
99
with fsspec .open (filename , mode = "rb" ) as file :
96
100
pv_power_ds = xr .open_dataset (file , engine = "h5netcdf" )
97
- pv_capacity_wp = pv_power_ds .max ().to_pandas ().astype (np .float32 )
101
+ pv_capacity_watt_power = pv_power_ds .max ().to_pandas ().astype (np .float32 )
98
102
pv_power_watts = pv_power_ds .sel (datetime = slice (start_date , end_date )).to_dataframe ()
99
103
pv_power_watts = pv_power_watts .astype (np .float32 )
100
104
del pv_power_ds
@@ -105,14 +109,15 @@ def _load_pv_power_watts_and_capacity_wp(
105
109
pv_power_watts .tz_localize ("Europe/London" ).tz_convert ("UTC" ).tz_convert (None )
106
110
)
107
111
108
- pv_capacity_wp .index = [np .int32 (col ) for col in pv_capacity_wp .index ]
112
+ pv_capacity_watt_power .index = [np .int32 (col ) for col in pv_capacity_watt_power .index ]
109
113
pv_power_watts .columns = pv_power_watts .columns .astype (np .int64 )
110
114
111
- # Create pv_system_row_number. We use the index of `pv_capacity_wp` because that includes
115
+ # Create pv_system_row_number. We use the index of
116
+ # `pv_capacity_watt_power` because that includes
112
117
# the PV system IDs for the entire dataset (independent of `start_date` and `end_date`).
113
118
# We use `float32` for the ID because we use NaN to indicate a missing PV system,
114
119
# or that this whole example doesn't include PV.
115
- all_pv_system_ids = pv_capacity_wp .index
120
+ all_pv_system_ids = pv_capacity_watt_power .index
116
121
pv_system_row_number = np .arange (start = 0 , stop = len (all_pv_system_ids ), dtype = np .float32 )
117
122
pv_system_row_number = pd .Series (pv_system_row_number , index = all_pv_system_ids )
118
123
@@ -134,7 +139,9 @@ def _load_pv_power_watts_and_capacity_wp(
134
139
135
140
# Drop any PV systems whose PV capacity is too low:
136
141
PV_CAPACITY_THRESHOLD_W = 100
137
- pv_systems_to_drop = pv_capacity_wp .index [pv_capacity_wp <= PV_CAPACITY_THRESHOLD_W ]
142
+ pv_systems_to_drop = pv_capacity_watt_power .index [
143
+ pv_capacity_watt_power <= PV_CAPACITY_THRESHOLD_W
144
+ ]
138
145
pv_systems_to_drop = pv_systems_to_drop .intersection (pv_power_watts .columns )
139
146
_log .info (
140
147
f"Dropping { len (pv_systems_to_drop )} PV systems because their max power is less than"
@@ -144,7 +151,7 @@ def _load_pv_power_watts_and_capacity_wp(
144
151
145
152
# Ensure that capacity and pv_system_row_number use the same PV system IDs as the power DF:
146
153
pv_system_ids = pv_power_watts .columns
147
- pv_capacity_wp = pv_capacity_wp .loc [pv_system_ids ]
154
+ pv_capacity_watt_power = pv_capacity_watt_power .loc [pv_system_ids ]
148
155
pv_system_row_number = pv_system_row_number .loc [pv_system_ids ]
149
156
150
157
_log .info (
@@ -157,58 +164,11 @@ def _load_pv_power_watts_and_capacity_wp(
157
164
# Sanity checks:
158
165
assert not pv_power_watts .columns .duplicated ().any ()
159
166
assert not pv_power_watts .index .duplicated ().any ()
160
- assert np .isfinite (pv_capacity_wp ).all ()
161
- assert (pv_capacity_wp >= 0 ).all ()
167
+ assert np .isfinite (pv_capacity_watt_power ).all ()
168
+ assert (pv_capacity_watt_power >= 0 ).all ()
162
169
assert np .isfinite (pv_system_row_number ).all ()
163
- assert np .array_equal (pv_power_watts .columns , pv_capacity_wp .index )
164
- return pv_power_watts , pv_capacity_wp , pv_system_row_number
165
-
166
-
167
- """Filtering to be added in a different IterDataPipe
168
-
169
- pv_power_watts = pv_power_watts.clip(lower=0, upper=5e7)
170
- # Convert the pv_system_id column names from strings to ints:
171
- pv_power_watts.columns = [np.int32(col) for col in pv_power_watts.columns]
172
-
173
- if "passiv" not in filename:
174
- _log.warning("Converting timezone. ARE YOU SURE THAT'S WHAT YOU WANT TO DO?")
175
- pv_power_watts = (
176
- pv_power_watts.tz_localize("Europe/London").tz_convert("UTC").tz_convert(None)
177
- )
178
-
179
- pv_power_watts = _drop_pv_systems_which_produce_overnight(pv_power_watts)
180
-
181
- # Resample to 5-minutely and interpolate up to 15 minutes ahead.
182
- # TODO: Issue #74: Give users the option to NOT resample (because Perceiver IO
183
- # doesn't need all the data to be perfectly aligned).
184
- pv_power_watts = pv_power_watts.resample("5T").interpolate(method="time", limit=3)
185
- pv_power_watts.dropna(axis="index", how="all", inplace=True)
186
- pv_power_watts.dropna(axis="columns", how="all", inplace=True)
187
-
188
- # Drop any PV systems whose PV capacity is too low:
189
- PV_CAPACITY_THRESHOLD_W = 100
190
- pv_systems_to_drop = pv_capacity_wp.index[pv_capacity_wp <= PV_CAPACITY_THRESHOLD_W]
191
- pv_systems_to_drop = pv_systems_to_drop.intersection(pv_power_watts.columns)
192
- _log.info(
193
- f"Dropping {len(pv_systems_to_drop)} PV systems because their max power is less than"
194
- f" {PV_CAPACITY_THRESHOLD_W}"
195
- )
196
- pv_power_watts.drop(columns=pv_systems_to_drop, inplace=True)
197
-
198
- # Ensure that capacity and pv_system_row_num use the same PV system IDs as the power DF:
199
- pv_system_ids = pv_power_watts.columns
200
- pv_capacity_wp = pv_capacity_wp.loc[pv_system_ids]
201
- pv_system_row_number = pv_system_row_number.loc[pv_system_ids]
202
-
203
- _log.info(
204
- "After filtering & resampling to 5 minutes:"
205
- f" pv_power = {pv_power_watts.values.nbytes / 1e6:,.1f} MBytes."
206
- f" {len(pv_power_watts)} PV power datetimes."
207
- f" {len(pv_power_watts.columns)} PV power PV system IDs."
208
- )
209
-
210
-
211
- """
170
+ assert np .array_equal (pv_power_watts .columns , pv_capacity_watt_power .index )
171
+ return pv_power_watts , pv_capacity_watt_power , pv_system_row_number
212
172
213
173
214
174
# Adapted from nowcasting_dataset.data_sources.pv.pv_data_source
0 commit comments