Skip to content
This repository was archived by the owner on Jun 2, 2025. It is now read-only.

Commit 9da51a6

Browse files
Merge pull request #47 from openclimatefix/issue/gsp-database
Issue/gsp database
2 parents 4e2e144 + c32e1c8 commit 9da51a6

File tree

13 files changed

+376
-87
lines changed

13 files changed

+376
-87
lines changed

ocf_datapipes/load/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
"""Loading datapipes from the raw data"""
2+
from ocf_datapipes.load.gsp.database import OpenGSPFromDatabaseIterDataPipe as OpenGSPFromDatabase
3+
from ocf_datapipes.load.gsp.gsp import OpenGSPIterDataPipe as OpenGSP
24
from ocf_datapipes.load.pv.live import OpenPVFromDBIterDataPipe as OpenPVFromDB
35
from ocf_datapipes.load.pv.pv import OpenPVFromNetCDFIterDataPipe as OpenPVFromNetCDF
46

57
from .configuration import OpenConfigurationIterDataPipe as OpenConfiguration
6-
from .gsp import OpenGSPIterDataPipe as OpenGSP
78
from .nwp import OpenNWPIterDataPipe as OpenNWP
89
from .satellite import OpenSatelliteIterDataPipe as OpenSatellite
910

ocf_datapipes/load/gsp/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
""" Load GSP data from file or database """

ocf_datapipes/load/gsp/database.py

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
""" Function to get data from live database """
2+
import logging
3+
import os
4+
from datetime import datetime, timedelta, timezone
5+
from typing import List
6+
7+
import numpy as np
8+
import pandas as pd
9+
import xarray as xr
10+
from nowcasting_datamodel import N_GSP
11+
from nowcasting_datamodel.connection import DatabaseConnection
12+
from nowcasting_datamodel.models.base import Base_Forecast
13+
from nowcasting_datamodel.models.gsp import GSPYield, GSPYieldSQL, Location
14+
from nowcasting_datamodel.read.read_gsp import get_gsp_yield
15+
from torchdata.datapipes import functional_datapipe
16+
from torchdata.datapipes.iter import IterDataPipe
17+
18+
from ocf_datapipes.load.gsp.utils import put_gsp_data_into_an_xr_dataarray
19+
from ocf_datapipes.utils.eso import get_gsp_shape_from_eso
20+
21+
logger = logging.getLogger(__name__)
22+
23+
24+
@functional_datapipe("open_gsp_from_database")
class OpenGSPFromDatabaseIterDataPipe(IterDataPipe):
    """Datapipe that loads recent GSP PV power from the live database"""

    def __init__(
        self,
        history_minutes: int = 90,
        interpolate_minutes: int = 60,
        load_extra_minutes: int = 60,
    ):
        """
        Store the settings used when the GSP data is loaded

        Args:
            history_minutes: Length of the history window to return, in minutes
            interpolate_minutes: Number of minutes of missing data to interpolate
            load_extra_minutes: Additional minutes to load on top of the history window
        """
        self.history_duration = timedelta(minutes=history_minutes)
        self.interpolate_minutes = interpolate_minutes
        self.load_extra_minutes = load_extra_minutes

    def __iter__(self) -> xr.DataArray:
        """Load the GSP data once, then yield the same DataArray indefinitely"""

        logger.debug("Getting GSP data")

        power_mw_df, capacity_df = get_gsp_power_from_database(
            history_duration=self.history_duration,
            interpolate_minutes=self.interpolate_minutes,
            load_extra_minutes=self.load_extra_minutes,
        )

        # GSP shapes, restricted to (and ordered like) the GSP ids that are
        # the columns of the power data, so centroids line up with the power.
        shapes = get_gsp_shape_from_eso(return_filename=False)
        shapes = shapes.loc[power_mw_df.columns]

        data_array = put_gsp_data_into_an_xr_dataarray(
            gsp_pv_power_mw=power_mw_df.astype(np.float32),
            time_utc=power_mw_df.index.values,
            gsp_id=power_mw_df.columns,
            # TODO: Try using `gsp_id_to_shape.geometry.envelope.centroid`. See issue #76.
            x_osgb=shapes.geometry.centroid.x.astype(np.float32),
            y_osgb=shapes.geometry.centroid.y.astype(np.float32),
            capacity_megawatt_power=capacity_df.astype(np.float32),
        )

        # The intermediate frames are no longer needed once the DataArray exists
        del shapes, power_mw_df

        while True:
            yield data_array
77+
78+
79+
def get_gsp_power_from_database(
    history_duration: timedelta, interpolate_minutes: int, load_extra_minutes: int
) -> (pd.DataFrame, pd.DataFrame):
    """
    Get GSP power and installed capacity from the database

    The yields are put onto a regular 30 minute grid that ends at the most recent
    half hour, gaps are interpolated, the extra loaded period is removed again and
    the power is clipped so that it is never negative.

    Args:
        history_duration: a timedelta of how far into the past to load
        interpolate_minutes: how many minutes we should interpolate the data forward for
        load_extra_minutes: the extra minutes we should load, in order to load more data.
            This is because some data from a site lags significantly behind 'now'

    Returns:
        Two pandas data frames: the GSP power in MW and the installed capacity in MW.
        For both, the index is the datetime and there is one column per GSP id.
        If no yields are found, two empty data frames with a single "gsp_id" column
        are returned instead.
    """

    logger.info("Loading GSP data from database")
    logger.debug(f"{history_duration=}")
    logger.debug(f"{interpolate_minutes=}")
    logger.debug(f"{load_extra_minutes=}")

    extra_duration = timedelta(minutes=load_extra_minutes)
    # Use "30min" rather than the "30T" alias, which newer pandas releases deprecate
    now = pd.to_datetime(datetime.now(tz=timezone.utc)).floor("30min")
    start_utc = now - history_duration
    start_utc_extra = start_utc - extra_duration

    # create empty dataframe with 30 mins periods
    empty_df = pd.DataFrame(
        index=pd.date_range(start=start_utc_extra, end=now, freq="30min", tz=timezone.utc)
    )

    # make database connection
    url = os.getenv("DB_URL")
    db_connection = DatabaseConnection(url=url, base=Base_Forecast)

    with db_connection.get_session() as session:
        # We minus 1 second just to make sure we don't miss that value
        gsp_yields: List[GSPYieldSQL] = get_gsp_yield(
            session=session,
            start_datetime_utc=start_utc_extra - timedelta(seconds=1),
            gsp_ids=list(range(1, N_GSP + 1)),
            filter_nans=False,
        )

        logger.debug(f"Found {len(gsp_yields)} GSP yields from the database")

        # Convert the ORM rows to plain dicts while the session is still open
        gsp_yields_dict = []
        for gsp_yield in gsp_yields:
            location = Location.from_orm(gsp_yield.location)
            gsp_yield = GSPYield.from_orm(gsp_yield)

            # Copy, so the model's own attribute dict is not modified
            gsp_yield_dict = dict(gsp_yield.__dict__)
            gsp_yield_dict["installed_capacity_mw"] = location.installed_capacity_mw
            gsp_yield_dict["solar_generation_mw"] = gsp_yield_dict["solar_generation_kw"] / 1000
            gsp_yield_dict["gsp_id"] = location.gsp_id
            gsp_yields_dict.append(gsp_yield_dict)

    gsp_yields_df = pd.DataFrame(gsp_yields_dict).fillna(0)

    logger.debug(gsp_yields_df.columns)

    if len(gsp_yields_df) == 0:
        logger.warning("Found no gsp yields, this might cause an error")
        return pd.DataFrame(columns=["gsp_id"]), pd.DataFrame(columns=["gsp_id"])

    logger.debug(f"Found {len(gsp_yields_df)} gsp yields")

    # Keep only the columns needed for the pivots
    gsp_yields_df = gsp_yields_df[
        ["datetime_utc", "gsp_id", "solar_generation_mw", "installed_capacity_mw"]
    ]
    logger.debug(gsp_yields_df.columns)
    logger.debug(gsp_yields_df.index)

    # `pivot` raises a ValueError if a (datetime_utc, gsp_id) pair occurs more than once,
    # so keep exactly one row per pair, even when the duplicated rows differ in value.
    # NOTE(review): this keeps the last row in query order - confirm that is the
    # preferred one when the values differ.
    gsp_yields_df = gsp_yields_df.drop_duplicates(subset=["datetime_utc", "gsp_id"], keep="last")
    logger.debug(gsp_yields_df.columns)
    logger.debug(gsp_yields_df.index)

    # pivot so that the index is the datetime and the columns are the GSP ids
    gsp_power_df = gsp_yields_df.pivot(
        index="datetime_utc", columns="gsp_id", values="solar_generation_mw"
    )
    gsp_capacity_df = gsp_yields_df.pivot(
        index="datetime_utc", columns="gsp_id", values="installed_capacity_mw"
    )

    # Put the data onto the regular 30 minute grid; missing periods become NaN
    logger.debug(f"{empty_df=}")
    logger.debug(f"{gsp_power_df=}")
    gsp_power_df = empty_df.join(gsp_power_df)
    gsp_capacity_df = empty_df.join(gsp_capacity_df)

    # interpolate in between, maximum 'interpolate_minutes' mins
    # note data is in 30 minutes chunks
    limit = int(interpolate_minutes / 30)
    if limit > 0:
        gsp_power_df.interpolate(
            limit=limit, inplace=True, method="cubic", fill_value="extrapolate"
        )
        gsp_capacity_df.interpolate(
            limit=limit, inplace=True, method="cubic", fill_value="extrapolate"
        )

    # filter out the extra minutes loaded
    logger.debug(f"{len(gsp_power_df)} of datetimes before filter on {start_utc}")
    gsp_power_df = gsp_power_df[gsp_power_df.index >= start_utc]
    gsp_capacity_df = gsp_capacity_df[gsp_capacity_df.index >= start_utc]
    logger.debug(f"{len(gsp_power_df)} of datetimes after filter on {start_utc}")

    # clip values to 0, this just stops any interpolation going below zero.
    # Reassign rather than clip in place, as the frame was derived by filtering.
    gsp_power_df = gsp_power_df.clip(lower=0)

    return gsp_power_df, gsp_capacity_df

ocf_datapipes/load/gsp.py renamed to ocf_datapipes/load/gsp/gsp.py

Lines changed: 4 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44
from pathlib import Path
55
from typing import Optional, Union
66

7-
import geopandas as gpd
87
import numpy as np
9-
import pandas as pd
108
import xarray as xr
119
from torchdata.datapipes import functional_datapipe
1210
from torchdata.datapipes.iter import IterDataPipe
1311

12+
from ocf_datapipes.load.gsp.utils import get_gsp_id_to_shape, put_gsp_data_into_an_xr_dataarray
13+
1414
logger = logging.getLogger(__name__)
1515

1616
try:
@@ -60,7 +60,7 @@ def __init__(
6060

6161
def __iter__(self) -> xr.DataArray:
6262
"""Get and return GSP data"""
63-
gsp_id_to_shape = _get_gsp_id_to_shape(
63+
gsp_id_to_shape = get_gsp_id_to_shape(
6464
self.gsp_id_to_region_id_filename, self.sheffield_solar_region_path
6565
)
6666
self._gsp_id_to_shape = gsp_id_to_shape # Save, mostly for plotting to check all is fine!
@@ -78,7 +78,7 @@ def __iter__(self) -> xr.DataArray:
7878

7979
# Ensure the centroids have the same GSP ID index as the GSP PV power:
8080
gsp_id_to_shape = gsp_id_to_shape.loc[gsp_pv_power_mw_ds.gsp_id]
81-
data_array = _put_gsp_data_into_an_xr_dataarray(
81+
data_array = put_gsp_data_into_an_xr_dataarray(
8282
gsp_pv_power_mw=gsp_pv_power_mw_ds.generation_mw.data.astype(np.float32),
8383
time_utc=gsp_pv_power_mw_ds.datetime_gmt.data,
8484
gsp_id=gsp_pv_power_mw_ds.gsp_id.data,
@@ -93,76 +93,3 @@ def __iter__(self) -> xr.DataArray:
9393
del gsp_id_to_shape, gsp_pv_power_mw_ds
9494
while True:
9595
yield data_array
96-
97-
98-
def _get_gsp_id_to_shape(
    gsp_id_to_region_id_filename: str, sheffield_solar_region_path: str
) -> gpd.GeoDataFrame:
    """
    Build the mapping from GSP ID to region shape

    Args:
        gsp_id_to_region_id_filename: Filename of the CSV mapping GSP IDs to region IDs
        sheffield_solar_region_path: Path to the Sheffield Solar region shapes

    Returns:
        GeoDataFrame indexed by GSP ID, holding one geometry per GSP
    """
    # Mapping from GSP ID to Sheffield Solar region ID
    id_mapping = pd.read_csv(
        gsp_id_to_region_id_filename,
        usecols=["gsp_id", "region_id"],
        dtype={"gsp_id": np.int64, "region_id": np.int64},
    )

    # Sheffield Solar region shapes (which are already in OSGB36 CRS)
    region_shapes = gpd.read_file(sheffield_solar_region_path)

    # Attach a GSP ID to every region shape, then keep just the geometry,
    # indexed and sorted by GSP ID
    merged = region_shapes.merge(id_mapping, left_on="RegionID", right_on="region_id")
    shapes_by_gsp = merged.set_index("gsp_id")[["geometry"]]
    shapes_by_gsp = shapes_by_gsp.sort_index()

    # A GSP can be made up of several shapes. `dissolve(by="gsp_id")` groups by
    # "gsp_id" and takes the spatial union, so that a centroid taken later is the
    # centroid of the whole GSP and not of one of its parts.
    return shapes_by_gsp.dissolve(by="gsp_id")
133-
134-
135-
def _put_gsp_data_into_an_xr_dataarray(
    gsp_pv_power_mw: np.ndarray,
    time_utc: np.ndarray,
    gsp_id: np.ndarray,
    x_osgb: np.ndarray,
    y_osgb: np.ndarray,
    capacity_megawatt_power: np.ndarray,
) -> xr.DataArray:
    """
    Wrap the GSP PV power and its metadata in one Xarray DataArray

    Args:
        gsp_pv_power_mw: GSP PV power, with dimensions (time_utc, gsp_id)
        time_utc: Time in UTC, the coordinate of the first dimension
        gsp_id: Id of the GSPs, the coordinate of the second dimension
        x_osgb: OSGB x coordinates, one per GSP
        y_osgb: OSGB y coordinates, one per GSP
        capacity_megawatt_power: Capacity of each GSP, with dimensions (time_utc, gsp_id)

    Returns:
        Xarray DataArray named "gsp_pv_power_mw" with the extra coordinates attached
    """
    # Dimension coordinates, in the order of the axes of the power data
    dimension_coords = (("time_utc", time_utc), ("gsp_id", gsp_id))
    power = xr.DataArray(gsp_pv_power_mw, coords=dimension_coords, name="gsp_pv_power_mw")

    # Non-dimension coordinates: location per GSP, capacity per time and GSP
    extra_coords = {
        "x_osgb": ("gsp_id", x_osgb),
        "y_osgb": ("gsp_id", y_osgb),
        "capacity_megawatt_power": (("time_utc", "gsp_id"), capacity_megawatt_power),
    }
    return power.assign_coords(**extra_coords)

ocf_datapipes/load/gsp/utils.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
""" Utils for GSP loading"""
2+
import geopandas as gpd
3+
import numpy as np
4+
import pandas as pd
5+
import xarray as xr
6+
7+
8+
def put_gsp_data_into_an_xr_dataarray(
    gsp_pv_power_mw: np.ndarray,
    time_utc: np.ndarray,
    gsp_id: np.ndarray,
    x_osgb: np.ndarray,
    y_osgb: np.ndarray,
    capacity_megawatt_power: np.ndarray,
) -> xr.DataArray:
    """
    Wrap the GSP PV power and its metadata in one Xarray DataArray

    Args:
        gsp_pv_power_mw: GSP PV power, with dimensions (time_utc, gsp_id)
        time_utc: Time in UTC, the coordinate of the first dimension
        gsp_id: Id of the GSPs, the coordinate of the second dimension
        x_osgb: OSGB x coordinates, one per GSP
        y_osgb: OSGB y coordinates, one per GSP
        capacity_megawatt_power: Capacity of each GSP, with dimensions (time_utc, gsp_id)

    Returns:
        Xarray DataArray named "gsp_pv_power_mw" with the extra coordinates attached
    """
    # Dimension coordinates, in the order of the axes of the power data
    dimension_coords = (("time_utc", time_utc), ("gsp_id", gsp_id))
    power = xr.DataArray(gsp_pv_power_mw, coords=dimension_coords, name="gsp_pv_power_mw")

    # Non-dimension coordinates: location per GSP, capacity per time and GSP
    extra_coords = {
        "x_osgb": ("gsp_id", x_osgb),
        "y_osgb": ("gsp_id", y_osgb),
        "capacity_megawatt_power": (("time_utc", "gsp_id"), capacity_megawatt_power),
    }
    return power.assign_coords(**extra_coords)
42+
43+
44+
def get_gsp_id_to_shape(
    gsp_id_to_region_id_filename: str, sheffield_solar_region_path: str
) -> gpd.GeoDataFrame:
    """
    Build the mapping from GSP ID to region shape

    Args:
        gsp_id_to_region_id_filename: Filename of the CSV mapping GSP IDs to region IDs
        sheffield_solar_region_path: Path to the Sheffield Solar region shapes

    Returns:
        GeoDataFrame indexed by GSP ID, holding one geometry per GSP
    """
    # Mapping from GSP ID to Sheffield Solar region ID
    id_mapping = pd.read_csv(
        gsp_id_to_region_id_filename,
        usecols=["gsp_id", "region_id"],
        dtype={"gsp_id": np.int64, "region_id": np.int64},
    )

    # Sheffield Solar region shapes (which are already in OSGB36 CRS)
    region_shapes = gpd.read_file(sheffield_solar_region_path)

    # Attach a GSP ID to every region shape, then keep just the geometry,
    # indexed and sorted by GSP ID
    merged = region_shapes.merge(id_mapping, left_on="RegionID", right_on="region_id")
    shapes_by_gsp = merged.set_index("gsp_id")[["geometry"]]
    shapes_by_gsp = shapes_by_gsp.sort_index()

    # A GSP can be made up of several shapes. `dissolve(by="gsp_id")` groups by
    # "gsp_id" and takes the spatial union, so that a centroid taken later is the
    # centroid of the whole GSP and not of one of its parts.
    return shapes_by_gsp.dissolve(by="gsp_id")

0 commit comments

Comments
 (0)