refactor: separate system into single Dockerfiles
This commit is contained in:
parent
ad87f702f1
commit
ed803a2ca5
26 changed files with 238 additions and 85 deletions
0
packages/common/common/__init__.py
Normal file
0
packages/common/common/__init__.py
Normal file
1
packages/common/common/collectors/__init__.py
Normal file
1
packages/common/common/collectors/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"A Python package to get hourly energy prices and weather"
|
||||
58
packages/common/common/collectors/smard.py
Normal file
58
packages/common/common/collectors/smard.py
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
"""
|
||||
Collector for SMARD (Electricity Market Data) API.
|
||||
"""
|
||||
|
||||
import time
|
||||
import logging
|
||||
import polars as pl
|
||||
from ..utils import request_utils
|
||||
from ..utils.config_loader import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def fetch_smard_data(filter_id: int, region: str | None = None) -> pl.DataFrame:
    """
    Fetches time-series data from the SMARD API for a specific filter.

    The hourly index of the filter/region is requested first. The chunk whose
    start timestamp lies closest to the current time is then downloaded, and
    rows without a value are dropped.

    Args:
        filter_id: The SMARD data identifier (e.g., 4169 for price).
        region: The region code (defaults to config value).

    Returns:
        A Polars DataFrame with 'timestamp' (ms) and 'value'. An empty
        DataFrame without columns is returned when the index or the series
        holds no usable data, or when any step fails (failures are logged,
        never raised).
    """
    region = region or settings.smard.region
    base_url = settings.smard.base_url

    now_ms = int(time.time()) * 1000
    index_url = f"{base_url}/{filter_id}/{region}/index_hour.json"

    try:
        index_data = request_utils.make_requests(index_url)
        timestamps = (
            index_data.get("timestamps") if isinstance(index_data, dict) else None
        )
        # An empty list is as useless as a missing key: min() would raise
        # ValueError on it and be reported as a generic fetch failure.
        if not timestamps:
            logger.warning("No index data for filter %s", filter_id)
            return pl.DataFrame()

        closest_ts = min(timestamps, key=lambda ts: abs(ts - now_ms))

        data_url = (
            f"{base_url}/{filter_id}/{region}/"
            f"{filter_id}_{region}_hour_{closest_ts}.json"
        )
        payload = request_utils.make_requests(data_url)
        # Tolerate a missing/non-dict payload and an explicit `"series": null`.
        series = (payload.get("series") if isinstance(payload, dict) else None) or []

        # Rows are [timestamp, value] pairs; drop those without a value.
        clean_series = [row for row in series if row[1] is not None]
        if not clean_series:
            return pl.DataFrame()

        return pl.DataFrame(
            {
                "timestamp": [row[0] for row in clean_series],
                "value": [row[1] for row in clean_series],
            },
            schema={"timestamp": pl.Int64, "value": pl.Float64},
        )
    except Exception as e:
        # Deliberate best-effort boundary: callers get an empty frame
        # instead of an exception.
        logger.error("Failed to fetch SMARD filter %s: %s", filter_id, e)
        return pl.DataFrame()
|
||||
62
packages/common/common/collectors/weather.py
Normal file
62
packages/common/common/collectors/weather.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
"""
|
||||
Collector for Bright Sky (Weather) API.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import polars as pl
|
||||
from ..utils import request_utils
|
||||
from ..utils.config_loader import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fetch_weather(lat: float | None = None, lon: float | None = None) -> pl.DataFrame:
    """
    Fetches historical weather data from Bright Sky API.

    Requests the window from 72 hours ago until now (UTC, "dwd" units) and
    keeps only the columns defined in the Bronze schema that are present in
    the response.

    Args:
        lat: Latitude of the location (defaults to config value).
        lon: Longitude of the location (defaults to config value).

    Returns:
        A Polars DataFrame containing filtered raw weather parameters, or an
        empty DataFrame when the API returns nothing or the request fails
        (failures are logged, never raised).
    """
    # Compare against None explicitly: 0.0 is a valid latitude (equator) and
    # longitude (prime meridian) and must not be replaced by the default.
    if lat is None:
        lat = settings.brightsky.lat
    if lon is None:
        lon = settings.brightsky.lon
    url = settings.brightsky.base_url

    current_utc = datetime.now(timezone.utc)
    payload = {
        "date": (current_utc - timedelta(hours=72)).isoformat(),
        "last_date": current_utc.isoformat(),
        "lat": lat,
        "lon": lon,
        "units": "dwd",
        "tz": "Etc/UTC",
    }

    schema_cols = [
        "timestamp",
        "temperature",
        "wind_speed",
        "solar",
        "sunshine",
        "cloud_cover",
        "precipitation",
    ]

    try:
        data = request_utils.make_requests(url, params=payload)
        weather_list = data.get("weather", [])

        if not weather_list:
            logger.warning("No weather data returned from API.")
            return pl.DataFrame()

        df = pl.DataFrame(weather_list)

        # Only select schema columns the response actually contains.
        available_cols = [c for c in schema_cols if c in df.columns]
        return df.select(available_cols)

    except Exception as e:
        # Deliberate best-effort boundary: callers get an empty frame
        # instead of an exception.
        logger.error("Failed to fetch weather data: %s", e)
        return pl.DataFrame()
|
||||
12
packages/common/common/config/config.yaml
Normal file
12
packages/common/common/config/config.yaml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
smard:
  region: "DE-LU"
  # Named `price_filter` to match the SmardConfig field; a `filter_id` key
  # has no matching field on that model.
  price_filter: 4169
  base_url: "https://www.smard.de/app/chart_data"
|
||||
|
||||
brightsky:
|
||||
lat: 52.52
|
||||
lon: 13.41
|
||||
base_url: "https://api.brightsky.dev/weather"
|
||||
|
||||
database:
|
||||
path: "output/pipeline.duckdb"
|
||||
1
packages/common/common/transformators/__init__.py
Normal file
1
packages/common/common/transformators/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"module to transform and join dataframes"
|
||||
40
packages/common/common/transformators/transformator.py
Normal file
40
packages/common/common/transformators/transformator.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
"""
|
||||
Data transformation logic for price and weather data.
|
||||
"""
|
||||
|
||||
from datetime import timedelta
|
||||
import polars as pl
|
||||
|
||||
def transform_weather(df: pl.DataFrame) -> pl.DataFrame:
    """
    Cleans and standardizes raw weather data.

    Keeps the known weather columns that exist in ``df``, parses the string
    'timestamp' column into a millisecond-precision UTC datetime and sorts
    the rows by it. An empty input yields an empty DataFrame.
    """
    if df.is_empty():
        return pl.DataFrame()

    wanted = (
        "timestamp",
        "temperature",
        "wind_speed",
        "solar",
        "sunshine",
        "cloud_cover",
        "precipitation",
    )
    # Preserve the order above, skipping columns the input does not have.
    present = [name for name in wanted if name in df.columns]

    parsed_ts = pl.col("timestamp").str.to_datetime(
        format="%Y-%m-%dT%H:%M:%S%z", time_zone="UTC"
    )
    ts_ms_utc = parsed_ts.cast(pl.Datetime("ms", time_zone="UTC"))

    trimmed = df.select(present)
    return trimmed.with_columns(ts_ms_utc).sort("timestamp")
|
||||
|
||||
def transform_prices(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms raw SMARD timestamps to UTC Datetime.

    The 'timestamp' column is cast to a millisecond datetime, tagged with the
    UTC time zone, and the frame is sorted by it. An empty input yields an
    empty DataFrame.
    """
    if df.is_empty():
        return pl.DataFrame()

    naive_ts = pl.col("timestamp").cast(pl.Datetime("ms"))
    utc_ts = naive_ts.dt.replace_time_zone("UTC")

    converted = df.with_columns(utc_ts)
    return converted.sort("timestamp")
|
||||
1
packages/common/common/utils/__init__.py
Normal file
1
packages/common/common/utils/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"module with a set of utility functions"
|
||||
61
packages/common/common/utils/config_loader.py
Normal file
61
packages/common/common/utils/config_loader.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from pydantic import BaseModel
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class SmardConfig(BaseModel):
    """Settings for the SMARD collector: region, series filter ids, base URL."""

    # Region code used when building SMARD request URLs.
    region: str = "DE-LU"

    # Numeric SMARD filter ids, one per data series.
    price_filter: int = 4169
    load_forecast_filter: int = 4382
    generation_total_filter: int = 122
    wind_onshore_filter: int = 4069
    wind_offshore_filter: int = 4068
    pv_filter: int = 4070

    # Root of the SMARD chart-data endpoints.
    base_url: str = "https://www.smard.de/app/chart_data"
|
||||
|
||||
|
||||
class BrightSkyConfig(BaseModel):
    """Settings for the Bright Sky weather collector."""

    # Default coordinates queried when the caller passes none.
    lat: float = 52.52
    lon: float = 13.41

    # Weather endpoint of the Bright Sky API.
    base_url: str = "https://api.brightsky.dev/weather"
|
||||
|
||||
|
||||
class DatabaseConfig(BaseModel):
    """Settings for the DuckDB database."""

    # Location of the DuckDB database file.
    path: str = "output/pipeline.duckdb"
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """
    Root settings object bundling the SMARD, Bright Sky and database sections.

    Values may also come from the environment or a ``.env`` file: variables
    carry the ``STROM_`` prefix and use ``__`` to address nested fields.
    Unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        extra="ignore",
        env_prefix="STROM_",
        env_nested_delimiter="__",
        env_file=".env",
    )

    smard: SmardConfig = SmardConfig()
    brightsky: BrightSkyConfig = BrightSkyConfig()
    database: DatabaseConfig = DatabaseConfig()
|
||||
|
||||
|
||||
def load_config(config_path: str | None = None) -> Settings:
    """
    Builds the Settings object, seeded from a YAML file when one is found.

    When ``config_path`` is not given, ``config/config.yaml`` relative to the
    current working directory is tried first (dev), then the
    ``config/config.yaml`` located next to this package. If the chosen file
    does not exist, Settings is created without YAML values.

    Args:
        config_path: Optional explicit path to a YAML config file.

    Returns:
        The populated Settings instance.

    Raises:
        TypeError: If the top level of the YAML document is not a mapping.
    """
    if config_path:
        path = Path(config_path)
    else:
        # Try local first (dev) then package relative
        local_path = Path("config/config.yaml")
        pkg_path = Path(__file__).parent.parent / "config" / "config.yaml"
        path = local_path if local_path.exists() else pkg_path

    if not path.exists():
        return Settings()

    # Explicit encoding: otherwise the platform default would be used to
    # decode the file, which is not UTF-8 everywhere.
    with path.open("r", encoding="utf-8") as f:
        config_data = yaml.safe_load(f) or {}

    if not isinstance(config_data, dict):
        raise TypeError(
            f"Config file {path} must contain a mapping at the top level, "
            f"got {type(config_data).__name__}"
        )

    # NOTE(review): YAML values are passed as init kwargs, which presumably
    # take precedence over STROM_* environment variables - confirm intended.
    return Settings(**config_data)
|
||||
|
||||
|
||||
# Global settings instance
|
||||
settings = load_config()
|
||||
90
packages/common/common/utils/database.py
Normal file
90
packages/common/common/utils/database.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
"""
|
||||
DuckDB database interface for Bronze (Raw) and Gold (Combined) layers.
|
||||
"""
|
||||
|
||||
import duckdb
|
||||
import polars as pl
|
||||
from contextlib import contextmanager
|
||||
from .config_loader import settings
|
||||
|
||||
|
||||
@contextmanager
def get_connection(db_path: str = None):
    """
    Context manager for DuckDB connections.

    Opens a connection to ``db_path`` (falling back to the configured
    database path), yields it, and closes it when the ``with`` block exits,
    including when the block raises.
    """
    target = db_path if db_path else settings.database.path
    connection = duckdb.connect(target)
    try:
        yield connection
    finally:
        connection.close()
|
||||
|
||||
|
||||
def init_tables(con: duckdb.DuckDBPyConnection):
    """
    Initializes the database schema following the Medallion architecture.

    Creates, if missing, one raw table per SMARD series, the raw weather
    table and the combined table. Existing tables are left untouched.
    """
    # BRONZE LAYER (Raw API responses)
    bronze_smard_tables = (
        "prices_raw",
        "load_forecast_raw",
        "gen_total_raw",
        "wind_onshore_raw",
        "wind_offshore_raw",
        "pv_raw",
    )
    for name in bronze_smard_tables:
        con.execute(
            f"CREATE TABLE IF NOT EXISTS {name} (timestamp BIGINT PRIMARY KEY, value DOUBLE)"
        )

    weather_ddl = """
        CREATE TABLE IF NOT EXISTS weather_raw (
            timestamp VARCHAR PRIMARY KEY,
            temperature DOUBLE,
            wind_speed DOUBLE,
            solar DOUBLE,
            sunshine DOUBLE,
            cloud_cover DOUBLE,
            precipitation DOUBLE
        )
    """
    con.execute(weather_ddl)

    # GOLD LAYER (Transformed and Joined Business Data)
    combined_ddl = """
        CREATE TABLE IF NOT EXISTS combined (
            timestamp TIMESTAMP PRIMARY KEY,
            price DOUBLE,
            load_forecast DOUBLE,
            generation_total DOUBLE,
            wind_total DOUBLE,
            pv DOUBLE,
            temperature DOUBLE,
            wind_speed DOUBLE,
            solar DOUBLE,
            sunshine DOUBLE,
            cloud_cover DOUBLE,
            precipitation DOUBLE
        )
    """
    con.execute(combined_ddl)
|
||||
|
||||
|
||||
def upsert_raw(con: duckdb.DuckDBPyConnection, table_name: str, df: pl.DataFrame):
    """
    Inserts raw data using explicit columns to match the target table schema.

    Only columns that exist both in the target table and in ``df`` are
    written, and they are named explicitly in the INSERT so that a frame
    lacking some table columns still inserts. Rows whose ``timestamp``
    already exists are skipped (``DO NOTHING``), not updated.

    Args:
        con: Open DuckDB connection.
        table_name: Name of the target table. It is interpolated into the
            SQL text, so it must come from trusted code, never user input.
        df: Frame holding the rows to insert; an empty frame is a no-op.
    """
    if df.is_empty():
        return

    table_cols = con.execute(f"DESCRIBE {table_name}").pl()["column_name"].to_list()
    insert_cols = [c for c in table_cols if c in df.columns]

    # The SQL below refers to this local by name, so the name must be kept.
    df_to_insert = df.select(insert_cols)  # noqa: F841

    # Quote identifiers (doubling embedded quotes) and list them on both
    # sides, so the insert does not depend on column count or position.
    col_list = ", ".join('"' + c.replace('"', '""') + '"' for c in insert_cols)

    con.execute(
        f"INSERT INTO {table_name} ({col_list}) "
        f"SELECT {col_list} FROM df_to_insert "
        "ON CONFLICT (timestamp) DO NOTHING;"
    )
|
||||
|
||||
|
||||
def upsert_combined(con: duckdb.DuckDBPyConnection, df: pl.DataFrame):
    """
    Inserts business-ready data into the Gold layer.

    Rows of ``df`` are appended to the ``combined`` table via ``SELECT *``
    with no column list; rows whose ``timestamp`` already exists are skipped.
    An empty frame is a no-op.
    """
    # The SQL refers to the parameter `df` by name.
    if not df.is_empty():
        con.execute(
            "INSERT INTO combined SELECT * FROM df ON CONFLICT (timestamp) DO NOTHING;"
        )
|
||||
41
packages/common/common/utils/request_utils.py
Normal file
41
packages/common/common/utils/request_utils.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import logging
|
||||
|
||||
import requests
|
||||
from tenacity import (
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type((requests.exceptions.RequestException,)),
    reraise=True,
)
def make_requests(
    url: str, headers: dict = None, params: dict = None, timeout: int = 20
) -> dict:
    """
    Performs an HTTP GET request and returns the decoded JSON body.

    The call is attempted up to three times with exponential backoff when a
    ``requests`` RequestException occurs; the last exception is re-raised.
    Every failure is logged before being propagated.

    Args:
        url: Target URL.
        headers: Optional request headers.
        params: Optional query parameters.
        timeout: Request timeout in seconds.

    Returns:
        The parsed JSON response.
    """
    request_headers = headers if headers else {}
    query = params if params else {}

    # Checked in order: the first matching type decides the log label.
    labelled_errors = (
        (requests.exceptions.HTTPError, "HTTP Error"),
        (requests.ConnectionError, "Connection Error"),
        (requests.exceptions.Timeout, "Timeout Error"),
    )

    try:
        logger.debug(f"Requesting URL: {url} with params: {query}")
        response = requests.get(
            url, headers=request_headers, params=query, timeout=timeout
        )
        response.raise_for_status()
        return response.json()
    except Exception as exc:
        for exc_type, label in labelled_errors:
            if isinstance(exc, exc_type):
                logger.error(f"{label}: {exc}")
                break
        else:
            logger.error(f"Unknown Exception: {exc}")
        raise
|
||||
15
packages/common/pyproject.toml
Normal file
15
packages/common/pyproject.toml
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
[project]
name = "common"
version = "0.1.0"
description = "Shared logic and utilities for the Strompreis Pipeline"
dependencies = [
    "duckdb>=1.4.4",
    "polars>=1.38.1",
    "pydantic>=2.12.5",
    "pydantic-settings>=2.12.0",
    "pyyaml>=6.0.3",
    # Imported by common/utils/request_utils.py; must be declared so that
    # installing this package alone yields a working environment.
    "requests",
    "tenacity",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
|
||||
Loading…
Add table
Add a link
Reference in a new issue