This repository was archived by the owner on Jun 2, 2025. It is now read-only.

Commit 84f7559 (2 parents: 0163952 + 4e2e144)

Merge commit '4e2e1448252da9bdc6ff0560157ea150bc394edf' into issue/gsp-database

# Conflicts:
#	ocf_datapipes/load/gsp/gsp.py
#	ocf_datapipes/production/power_perceiver.py


51 files changed (+643, -595 lines)

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 [bumpversion]
 commit = True
 tag = True
-current_version = 0.2.9
+current_version = 0.2.13

 [bumpversion:file:setup.py]
 search = version="{current_version}"

environment.yml

Lines changed: 9 additions & 0 deletions

@@ -12,9 +12,18 @@ dependencies:
 - xarray
 - fsspec
 - zarr
+- cartopy
+- dask
+- pyproj
+- pyresample
+- geopandas
+- h5netcdf
 pip:
 - einops
 - pathy
 - git+https://github.com/SheffieldSolar/PV_Live-API
 - pyaml_env
 - nowcasting_datamodel
+- gitpython
+- tqdm
+- bottleneck

ocf_datapipes/convert/gsp.py

Lines changed: 3 additions & 1 deletion

@@ -27,7 +27,9 @@ def __iter__(self) -> NumpyBatch:
                 BatchKey.gsp: xr_data.values,
                 BatchKey.gsp_t0_idx: xr_data.attrs["t0_idx"],
                 BatchKey.gsp_id: xr_data.gsp_id.values,
-                BatchKey.gsp_capacity_mwp: xr_data.isel(time_utc=0)["capacity_mwp"].values,
+                BatchKey.gsp_capacity_megawatt_power: xr_data.isel(time_utc=0)[
+                    "capacity_megawatt_power"
+                ].values,
                 BatchKey.gsp_time_utc: datetime64_to_float(xr_data["time_utc"].values),
             }

ocf_datapipes/convert/pv.py

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ def __iter__(self) -> NumpyBatch:
                 BatchKey.pv_t0_idx: xr_data.attrs["t0_idx"],
                 BatchKey.pv_system_row_number: xr_data["pv_system_row_number"].values,
                 BatchKey.pv_id: xr_data["pv_system_id"].values.astype(np.float32),
-                BatchKey.pv_capacity_wp: xr_data["capacity_wp"].values,
+                BatchKey.pv_capacity_watt_power: xr_data["capacity_watt_power"].values,
                 BatchKey.pv_time_utc: datetime64_to_float(xr_data["time_utc"].values),
                 BatchKey.pv_x_osgb: xr_data["x_osgb"].values,
                 BatchKey.pv_y_osgb: xr_data["y_osgb"].values,
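
Aside (not part of the diff): a minimal sketch of how a consumer of the resulting NumpyBatch might use the renamed key. The import path and the BatchKey.pv power key are assumptions for illustration; only BatchKey.pv_capacity_watt_power comes from the hunk above.

import numpy as np

from ocf_datapipes.utils.consts import BatchKey  # assumed import path

def normalise_pv_power(batch: dict) -> np.ndarray:
    """Divide PV power (watts) by each system's capacity (watts) to get a 0-1 fraction."""
    pv_power = batch[BatchKey.pv]  # assumed key for the raw PV power array
    capacity = batch[BatchKey.pv_capacity_watt_power]  # key renamed in this commit
    return pv_power / capacity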

ocf_datapipes/load/gsp/gsp.py

Lines changed: 3 additions & 2 deletions

@@ -78,15 +78,16 @@ def __iter__(self) -> xr.DataArray:

         # Ensure the centroids have the same GSP ID index as the GSP PV power:
         gsp_id_to_shape = gsp_id_to_shape.loc[gsp_pv_power_mw_ds.gsp_id]
-
         data_array = put_gsp_data_into_an_xr_dataarray(
             gsp_pv_power_mw=gsp_pv_power_mw_ds.generation_mw.data.astype(np.float32),
             time_utc=gsp_pv_power_mw_ds.datetime_gmt.data,
             gsp_id=gsp_pv_power_mw_ds.gsp_id.data,
             # TODO: Try using `gsp_id_to_shape.geometry.envelope.centroid`. See issue #76.
             x_osgb=gsp_id_to_shape.geometry.centroid.x.astype(np.float32),
             y_osgb=gsp_id_to_shape.geometry.centroid.y.astype(np.float32),
-            capacity_mwp=gsp_pv_power_mw_ds.installedcapacity_mwp.data.astype(np.float32),
+            capacity_megawatt_power=gsp_pv_power_mw_ds.installedcapacity_mwp.data.astype(  # noqa
+                np.float32
+            ),
         )

         del gsp_id_to_shape, gsp_pv_power_mw_ds
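
Aside (not part of the diff): the capacity attached here is in megawatts, so dividing GSP generation by it gives a dimensionless capacity factor. A toy sketch, assuming capacity_megawatt_power ends up as a coordinate of the DataArray built by put_gsp_data_into_an_xr_dataarray:

import xarray as xr

def gsp_capacity_factor(gsp: xr.DataArray) -> xr.DataArray:
    """Return generation (MW) divided by installed capacity (MWp), broadcast by xarray."""
    return gsp / gsp["capacity_megawatt_power"]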

ocf_datapipes/load/pv/live.py

Lines changed: 2 additions & 2 deletions

@@ -85,7 +85,7 @@ def __iter__(self):
             pv_power_watts=pv_power,  # TODO check this is watts
             y_osgb=pv_metadata.y_osgb.astype(np.float32),
             x_osgb=pv_metadata.x_osgb.astype(np.float32),
-            capacity_wp=pv_metadata.capacity_wp,
+            capacity_watt_power=pv_metadata.capacity_watt_power,
             pv_system_row_number=pv_system_row_number,
         )

@@ -148,7 +148,7 @@ def get_metadata_from_database(providers: List[str] = None) -> pd.DataFrame:
     pv_system_all_df["y_osgb"] = y_osgb

     pv_system_all_df["capacity_kw"] = pv_system_all_df["installed_capacity_kw"]
-    pv_system_all_df["capacity_wp"] = pv_system_all_df["capacity_kw"] * 1000
+    pv_system_all_df["capacity_watt_power"] = pv_system_all_df["capacity_kw"] * 1000

     # sort index
     pv_system_all_df = pv_system_all_df.sort_index()
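
Aside (not part of the diff): the second hunk is a plain kilowatt-to-watt conversion. A toy illustration with made-up values:

import pandas as pd

# Made-up capacities in kilowatts, keyed by PV system ID.
capacity_kw = pd.Series({1001: 3.6, 1002: 250.0})
capacity_watt_power = capacity_kw * 1000  # 1 kW = 1,000 W
print(capacity_watt_power)  # 1001 -> 3600.0, 1002 -> 250000.0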

ocf_datapipes/load/pv/pv.py

Lines changed: 23 additions & 63 deletions

@@ -51,21 +51,25 @@ def __iter__(self):
 def load_everything_into_ram(pv_power_filename, pv_metadata_filename) -> xr.DataArray:
     """Open AND load PV data into RAM."""
     # Load pd.DataFrame of power and pd.Series of capacities:
-    pv_power_watts, pv_capacity_wp, pv_system_row_number = _load_pv_power_watts_and_capacity_wp(
+    (
+        pv_power_watts,
+        pv_capacity_watt_power,
+        pv_system_row_number,
+    ) = _load_pv_power_watts_and_capacity_watt_power(
         pv_power_filename,
     )
     pv_metadata = _load_pv_metadata(pv_metadata_filename)
-    # Ensure pv_metadata, pv_power_watts, and pv_capacity_wp all have the same set of
+    # Ensure pv_metadata, pv_power_watts, and pv_capacity_watt_power all have the same set of
     # PV system IDs, in the same order:
     pv_metadata, pv_power_watts = intersection_of_pv_system_ids(pv_metadata, pv_power_watts)
-    pv_capacity_wp = pv_capacity_wp.loc[pv_power_watts.columns]
+    pv_capacity_watt_power = pv_capacity_watt_power.loc[pv_power_watts.columns]
     pv_system_row_number = pv_system_row_number.loc[pv_power_watts.columns]

     data_in_ram = put_pv_data_into_an_xr_dataarray(
         pv_power_watts=pv_power_watts,
         y_osgb=pv_metadata.y_osgb.astype(np.float32),
         x_osgb=pv_metadata.x_osgb.astype(np.float32),
-        capacity_wp=pv_capacity_wp,
+        capacity_watt_power=pv_capacity_watt_power,
         pv_system_row_number=pv_system_row_number,
     )

@@ -77,12 +81,12 @@ def load_everything_into_ram(pv_power_filename, pv_metadata_filename) -> xr.Data
     return data_in_ram


-def _load_pv_power_watts_and_capacity_wp(
+def _load_pv_power_watts_and_capacity_watt_power(
     filename: Union[str, Path],
     start_date: Optional[datetime.datetime] = None,
     end_date: Optional[datetime.datetime] = None,
 ) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
-    """Return pv_power_watts, pv_capacity_wp, pv_system_row_number.
+    """Return pv_power_watts, pv_capacity_watt_power, pv_system_row_number.

     The capacities and pv_system_row_number are computed across the *entire* dataset,
     and so is independent of the `start_date` and `end_date`. This ensures the PV system
@@ -94,7 +98,7 @@ def _load_pv_power_watts_and_capacity_wp(
     # Load data in a way that will work in the cloud and locally:
     with fsspec.open(filename, mode="rb") as file:
         pv_power_ds = xr.open_dataset(file, engine="h5netcdf")
-        pv_capacity_wp = pv_power_ds.max().to_pandas().astype(np.float32)
+        pv_capacity_watt_power = pv_power_ds.max().to_pandas().astype(np.float32)
         pv_power_watts = pv_power_ds.sel(datetime=slice(start_date, end_date)).to_dataframe()
         pv_power_watts = pv_power_watts.astype(np.float32)
         del pv_power_ds
@@ -105,14 +109,15 @@ def _load_pv_power_watts_and_capacity_wp(
         pv_power_watts.tz_localize("Europe/London").tz_convert("UTC").tz_convert(None)
     )

-    pv_capacity_wp.index = [np.int32(col) for col in pv_capacity_wp.index]
+    pv_capacity_watt_power.index = [np.int32(col) for col in pv_capacity_watt_power.index]
     pv_power_watts.columns = pv_power_watts.columns.astype(np.int64)

-    # Create pv_system_row_number. We use the index of `pv_capacity_wp` because that includes
+    # Create pv_system_row_number. We use the index of
+    # `pv_capacity_watt_power` because that includes
     # the PV system IDs for the entire dataset (independent of `start_date` and `end_date`).
     # We use `float32` for the ID because we use NaN to indicate a missing PV system,
     # or that this whole example doesn't include PV.
-    all_pv_system_ids = pv_capacity_wp.index
+    all_pv_system_ids = pv_capacity_watt_power.index
     pv_system_row_number = np.arange(start=0, stop=len(all_pv_system_ids), dtype=np.float32)
     pv_system_row_number = pd.Series(pv_system_row_number, index=all_pv_system_ids)

@@ -134,7 +139,9 @@ def _load_pv_power_watts_and_capacity_wp(

     # Drop any PV systems whose PV capacity is too low:
     PV_CAPACITY_THRESHOLD_W = 100
-    pv_systems_to_drop = pv_capacity_wp.index[pv_capacity_wp <= PV_CAPACITY_THRESHOLD_W]
+    pv_systems_to_drop = pv_capacity_watt_power.index[
+        pv_capacity_watt_power <= PV_CAPACITY_THRESHOLD_W
+    ]
     pv_systems_to_drop = pv_systems_to_drop.intersection(pv_power_watts.columns)
     _log.info(
         f"Dropping {len(pv_systems_to_drop)} PV systems because their max power is less than"
@@ -144,7 +151,7 @@ def _load_pv_power_watts_and_capacity_wp(

     # Ensure that capacity and pv_system_row_num use the same PV system IDs as the power DF:
     pv_system_ids = pv_power_watts.columns
-    pv_capacity_wp = pv_capacity_wp.loc[pv_system_ids]
+    pv_capacity_watt_power = pv_capacity_watt_power.loc[pv_system_ids]
     pv_system_row_number = pv_system_row_number.loc[pv_system_ids]

     _log.info(
@@ -157,58 +164,11 @@ def _load_pv_power_watts_and_capacity_wp(
     # Sanity checks:
     assert not pv_power_watts.columns.duplicated().any()
     assert not pv_power_watts.index.duplicated().any()
-    assert np.isfinite(pv_capacity_wp).all()
-    assert (pv_capacity_wp >= 0).all()
+    assert np.isfinite(pv_capacity_watt_power).all()
+    assert (pv_capacity_watt_power >= 0).all()
     assert np.isfinite(pv_system_row_number).all()
-    assert np.array_equal(pv_power_watts.columns, pv_capacity_wp.index)
-    return pv_power_watts, pv_capacity_wp, pv_system_row_number
-
-
-    """Filtering to be added in a different IterDataPipe
-
-    pv_power_watts = pv_power_watts.clip(lower=0, upper=5e7)
-    # Convert the pv_system_id column names from strings to ints:
-    pv_power_watts.columns = [np.int32(col) for col in pv_power_watts.columns]
-
-    if "passiv" not in filename:
-        _log.warning("Converting timezone. ARE YOU SURE THAT'S WHAT YOU WANT TO DO?")
-        pv_power_watts = (
-            pv_power_watts.tz_localize("Europe/London").tz_convert("UTC").tz_convert(None)
-        )
-
-    pv_power_watts = _drop_pv_systems_which_produce_overnight(pv_power_watts)
-
-    # Resample to 5-minutely and interpolate up to 15 minutes ahead.
-    # TODO: Issue #74: Give users the option to NOT resample (because Perceiver IO
-    # doesn't need all the data to be perfectly aligned).
-    pv_power_watts = pv_power_watts.resample("5T").interpolate(method="time", limit=3)
-    pv_power_watts.dropna(axis="index", how="all", inplace=True)
-    pv_power_watts.dropna(axis="columns", how="all", inplace=True)
-
-    # Drop any PV systems whose PV capacity is too low:
-    PV_CAPACITY_THRESHOLD_W = 100
-    pv_systems_to_drop = pv_capacity_wp.index[pv_capacity_wp <= PV_CAPACITY_THRESHOLD_W]
-    pv_systems_to_drop = pv_systems_to_drop.intersection(pv_power_watts.columns)
-    _log.info(
-        f"Dropping {len(pv_systems_to_drop)} PV systems because their max power is less than"
-        f" {PV_CAPACITY_THRESHOLD_W}"
-    )
-    pv_power_watts.drop(columns=pv_systems_to_drop, inplace=True)
-
-    # Ensure that capacity and pv_system_row_num use the same PV system IDs as the power DF:
-    pv_system_ids = pv_power_watts.columns
-    pv_capacity_wp = pv_capacity_wp.loc[pv_system_ids]
-    pv_system_row_number = pv_system_row_number.loc[pv_system_ids]
-
-    _log.info(
-        "After filtering & resampling to 5 minutes:"
-        f" pv_power = {pv_power_watts.values.nbytes / 1e6:,.1f} MBytes."
-        f" {len(pv_power_watts)} PV power datetimes."
-        f" {len(pv_power_watts.columns)} PV power PV system IDs."
-    )
-
-    """
+    assert np.array_equal(pv_power_watts.columns, pv_capacity_watt_power.index)
+    return pv_power_watts, pv_capacity_watt_power, pv_system_row_number


 # Adapted from nowcasting_dataset.data_sources.pv.pv_data_source
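
Aside (not part of the diff): a rough call-site sketch for the renamed loader. The file paths are placeholders; the return types follow the signature above (a DataFrame of power plus two Series indexed by PV system ID).

# Hypothetical file paths; real filenames depend on your dataset.
(
    pv_power_watts,          # pd.DataFrame: index = UTC datetime, columns = PV system IDs
    pv_capacity_watt_power,  # pd.Series: max power per system, in watts
    pv_system_row_number,    # pd.Series: float32 row numbers used for ID embeddings
) = _load_pv_power_watts_and_capacity_watt_power("pv_power.nc")

data_array = load_everything_into_ram(
    pv_power_filename="pv_power.nc",
    pv_metadata_filename="pv_metadata.csv",
)
print(data_array.coords["capacity_watt_power"])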

ocf_datapipes/load/pv/utils.py

Lines changed: 5 additions & 4 deletions

@@ -25,7 +25,7 @@ def put_pv_data_into_an_xr_dataarray(
     pv_power_watts: pd.DataFrame,
     y_osgb: pd.Series,
     x_osgb: pd.Series,
-    capacity_wp: pd.Series,
+    capacity_watt_power: pd.Series,
     pv_system_row_number: pd.Series,
 ) -> xr.DataArray:
     """Convert to an xarray DataArray.
@@ -35,7 +35,8 @@
         ints), and the index is UTC datetime.
         x_osgb: The x location. Index = PV system ID ints.
         y_osgb: The y location. Index = PV system ID ints.
-        capacity_wp: The max power output of each PV system in Watts. Index = PV system ID ints.
+        capacity_watt_power: The max power output of each PV system in Watts.
+            Index = PV system ID ints.
         pv_system_row_number: The integer position of the PV system in the metadata.
             Used to create the PV system ID embedding.
     """
@@ -44,7 +45,7 @@
     for name, series in (
         ("x_osgb", x_osgb),
         ("y_osgb", y_osgb),
-        ("capacity_wp", capacity_wp),
+        ("capacity_watt_power", capacity_watt_power),
         ("pv_system_row_number", pv_system_row_number),
     ):
         logger.debug(f"Checking {name}")
@@ -64,7 +65,7 @@
     data_array = data_array.assign_coords(
         x_osgb=("pv_system_id", x_osgb),
         y_osgb=("pv_system_id", y_osgb),
-        capacity_wp=("pv_system_id", capacity_wp),
+        capacity_watt_power=("pv_system_id", capacity_watt_power),
         pv_system_row_number=("pv_system_id", pv_system_row_number),
     )
     # Sample period duration is required so PVDownsample transform knows by how much
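
Aside (not part of the diff): a toy, self-contained call of put_pv_data_into_an_xr_dataarray using the renamed keyword. All values are made up; only the keyword names come from the signature above.

import numpy as np
import pandas as pd

system_ids = pd.Index([101, 102], name="pv_system_id")
pv_power_watts = pd.DataFrame(
    [[0.0, 10.0], [500.0, 1200.0], [900.0, 2100.0]],
    index=pd.date_range("2022-06-01 10:00", periods=3, freq="5T"),
    columns=system_ids,
)
data_array = put_pv_data_into_an_xr_dataarray(
    pv_power_watts=pv_power_watts,
    y_osgb=pd.Series([180000.0, 185000.0], index=system_ids, dtype=np.float32),
    x_osgb=pd.Series([530000.0, 532000.0], index=system_ids, dtype=np.float32),
    capacity_watt_power=pd.Series([4000.0, 3500.0], index=system_ids, dtype=np.float32),
    pv_system_row_number=pd.Series([0.0, 1.0], index=system_ids, dtype=np.float32),
)
print(data_array.coords["capacity_watt_power"])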
