refactor: separate system into single Dockerfiles

This commit is contained in:
Patryk Hegenberg 2026-02-18 12:47:22 +01:00
parent ad87f702f1
commit ed803a2ca5
26 changed files with 238 additions and 85 deletions

View file

View file

@ -0,0 +1 @@
"A Python package to get hourly energy prices and weather"

View file

@ -0,0 +1,58 @@
"""
Collector for SMARD (Electricity Market Data) API.
"""
import time
import logging
import polars as pl
from ..utils import request_utils
from ..utils.config_loader import settings
logger = logging.getLogger(__name__)
def fetch_smard_data(filter_id: int, region: str | None = None) -> pl.DataFrame:
    """
    Fetches time-series data from the SMARD API for a specific filter.

    The hourly index for the filter is requested first; the index timestamp
    closest to the current time selects the data file that is downloaded.
    Rows whose value is null are dropped.

    Args:
        filter_id: The SMARD data identifier (e.g., 4169 for price).
        region: The region code (defaults to config value).

    Returns:
        A Polars DataFrame with 'timestamp' (ms) and 'value'. An empty,
        column-less DataFrame is returned when the index or the series has
        no usable data, or when any step fails (the error is logged, not
        raised).
    """
    region = region or settings.smard.region
    base_url = settings.smard.base_url
    now_ms = int(time.time()) * 1000
    index_url = f"{base_url}/{filter_id}/{region}/index_hour.json"

    try:
        index_data = request_utils.make_requests(index_url)
        # An index that is missing, malformed or has an empty timestamp list
        # means "no data": stop here instead of letting min() raise on an
        # empty sequence and be reported as a fetch failure.
        timestamps = (
            index_data.get("timestamps") if isinstance(index_data, dict) else None
        )
        if not timestamps:
            logger.warning(f"No index data for filter {filter_id}")
            return pl.DataFrame()

        closest_ts = min(timestamps, key=lambda ts: abs(ts - now_ms))
        data_url = (
            f"{base_url}/{filter_id}/{region}/"
            f"{filter_id}_{region}_hour_{closest_ts}.json"
        )

        payload = request_utils.make_requests(data_url)
        # Tolerate a null body or a null "series" entry.
        series = (payload or {}).get("series") or []
        # Each row is indexed as [timestamp, value]; skip rows without a value.
        clean_series = [row for row in series if row[1] is not None]
        if not clean_series:
            return pl.DataFrame()

        return pl.DataFrame(
            {
                "timestamp": [row[0] for row in clean_series],
                "value": [row[1] for row in clean_series],
            },
            schema={"timestamp": pl.Int64, "value": pl.Float64},
        )
    except Exception as e:
        # Best-effort collector: report the problem and hand back an empty frame.
        logger.error(f"Failed to fetch SMARD filter {filter_id}: {e}")
        return pl.DataFrame()

View file

@ -0,0 +1,62 @@
"""
Collector for Bright Sky (Weather) API.
"""
import logging
from datetime import datetime, timedelta, timezone
import polars as pl
from ..utils import request_utils
from ..utils.config_loader import settings
logger = logging.getLogger(__name__)
def fetch_weather(lat: float | None = None, lon: float | None = None) -> pl.DataFrame:
    """
    Fetches historical weather data from Bright Sky API.

    Requests the window from 72 hours ago until now (UTC) and keeps only the
    columns defined in the Bronze schema.

    Args:
        lat: Latitude of the location; the configured value is used when None.
        lon: Longitude of the location; the configured value is used when None.

    Returns:
        A Polars DataFrame containing filtered raw weather parameters. An
        empty, column-less DataFrame is returned when the API yields no
        records or the request fails (the error is logged, not raised).
    """
    # Compare against None explicitly: 0.0 is a valid coordinate (equator /
    # prime meridian) and must not be replaced by the configured default.
    if lat is None:
        lat = settings.brightsky.lat
    if lon is None:
        lon = settings.brightsky.lon
    url = settings.brightsky.base_url

    current_utc = datetime.now(timezone.utc)
    payload = {
        "date": (current_utc - timedelta(hours=72)).isoformat(),
        "last_date": current_utc.isoformat(),
        "lat": lat,
        "lon": lon,
        "units": "dwd",
        "tz": "Etc/UTC",
    }
    schema_cols = [
        "timestamp",
        "temperature",
        "wind_speed",
        "solar",
        "sunshine",
        "cloud_cover",
        "precipitation",
    ]

    try:
        data = request_utils.make_requests(url, params=payload)
        # Tolerate a null body or a null "weather" entry.
        weather_list = (data or {}).get("weather") or []
        if not weather_list:
            logger.warning("No weather data returned from API.")
            return pl.DataFrame()

        df = pl.DataFrame(weather_list)
        # Select only the schema columns that the response actually contains.
        available_cols = [c for c in schema_cols if c in df.columns]
        return df.select(available_cols)
    except Exception as e:
        # Best-effort collector: report the problem and hand back an empty frame.
        logger.error(f"Failed to fetch weather data: {e}")
        return pl.DataFrame()

View file

@ -0,0 +1,12 @@
smard:
  region: "DE-LU"
  # SmardConfig declares the price series id as `price_filter`.
  price_filter: 4169
  # Legacy key: not a SmardConfig field, so the settings loader does not read
  # it. Kept for any consumer that still parses this file directly.
  filter_id: 4169
  base_url: "https://www.smard.de/app/chart_data"
brightsky:
  lat: 52.52
  lon: 13.41
  base_url: "https://api.brightsky.dev/weather"
database:
  path: "output/pipeline.duckdb"

View file

@ -0,0 +1 @@
"module to transform and join dataframes"

View file

@ -0,0 +1,40 @@
"""
Data transformation logic for price and weather data.
"""
from datetime import timedelta
import polars as pl
def transform_weather(df: pl.DataFrame) -> pl.DataFrame:
    """
    Trim raw weather data to the known columns and parse its timestamps.

    Args:
        df: Raw weather frame whose 'timestamp' column holds strings in the
            "%Y-%m-%dT%H:%M:%S%z" format.

    Returns:
        The frame restricted to the relevant columns that are present, with
        'timestamp' converted to a millisecond UTC datetime and rows sorted
        by it. An empty, column-less DataFrame is returned for empty input.
    """
    if df.is_empty():
        return pl.DataFrame()

    wanted = (
        "timestamp",
        "temperature",
        "wind_speed",
        "solar",
        "sunshine",
        "cloud_cover",
        "precipitation",
    )
    # Keep only the wanted columns the input really has, in the order above.
    present = [name for name in wanted if name in df.columns]

    parsed_timestamp = (
        pl.col("timestamp")
        .str.to_datetime(format="%Y-%m-%dT%H:%M:%S%z", time_zone="UTC")
        .cast(pl.Datetime("ms", time_zone="UTC"))
    )

    trimmed = df.select(present)
    return trimmed.with_columns(parsed_timestamp).sort("timestamp")
def transform_prices(df: pl.DataFrame) -> pl.DataFrame:
    """
    Convert the integer 'timestamp' column of raw SMARD data to UTC datetimes.

    Args:
        df: Raw frame with a 'timestamp' column of epoch milliseconds.

    Returns:
        The same frame with 'timestamp' as a millisecond datetime tagged as
        UTC, sorted by that column. An empty, column-less DataFrame is
        returned for empty input.
    """
    if df.is_empty():
        return pl.DataFrame()

    # Reinterpret the integers as naive ms datetimes, then label them as UTC.
    utc_timestamp = (
        pl.col("timestamp").cast(pl.Datetime("ms")).dt.replace_time_zone("UTC")
    )
    converted = df.with_columns(utc_timestamp)
    return converted.sort("timestamp")

View file

@ -0,0 +1 @@
"module with a set of utility functions"

View file

@ -0,0 +1,61 @@
from pathlib import Path
import yaml
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict
class SmardConfig(BaseModel):
    """Settings for the SMARD collector: region, filter ids and API base URL."""

    # Region code placed into SMARD request URLs.
    region: str = "DE-LU"
    # SMARD filter ids, one per data series. 4169 is the price series; the
    # meaning of the other ids is presumably what their field names say —
    # verify against the SMARD filter list.
    price_filter: int = 4169
    load_forecast_filter: int = 4382
    generation_total_filter: int = 122
    wind_onshore_filter: int = 4069
    wind_offshore_filter: int = 4068
    pv_filter: int = 4070
    # Root URL that the collector extends with filter id, region and file name.
    base_url: str = "https://www.smard.de/app/chart_data"
class BrightSkyConfig(BaseModel):
    """Settings for the Bright Sky weather collector."""

    # Default coordinates, used by the collector when the caller passes none.
    lat: float = 52.52
    lon: float = 13.41
    # Endpoint that the weather request is sent to.
    base_url: str = "https://api.brightsky.dev/weather"
class DatabaseConfig(BaseModel):
    """Settings for the DuckDB database."""

    # Database file path, used as the default target when a connection is opened.
    path: str = "output/pipeline.duckdb"
class Settings(BaseSettings):
    """
    Root settings object grouping the SMARD, Bright Sky and database sections.

    Values can also come from the environment or a `.env` file: variables
    carry the `STROM_` prefix and use `__` to address a nested field
    (e.g. `STROM_SMARD__REGION`). Unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_nested_delimiter="__",
        env_prefix="STROM_",
        extra="ignore"
    )
    # Each section falls back to its model defaults when nothing overrides it.
    smard: SmardConfig = SmardConfig()
    brightsky: BrightSkyConfig = BrightSkyConfig()
    database: DatabaseConfig = DatabaseConfig()
def load_config(config_path: str | None = None) -> Settings:
    """
    Build the Settings object, optionally seeded from a YAML file.

    Args:
        config_path: Path of the YAML file to read. When omitted,
            `config/config.yaml` relative to the working directory is tried
            first, then the `config/config.yaml` next to the package.

    Returns:
        Settings populated from the YAML file, or Settings with defaults
        (plus environment overrides) when the file does not exist.

    Raises:
        TypeError: If the YAML document's top level is not a mapping.
    """
    if config_path:
        path = Path(config_path)
    else:
        # Try local first (dev) then package relative
        local_path = Path("config/config.yaml")
        pkg_path = Path(__file__).parent.parent / "config" / "config.yaml"
        path = local_path if local_path.exists() else pkg_path

    if not path.exists():
        return Settings()

    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with path.open("r", encoding="utf-8") as f:
        config_data = yaml.safe_load(f) or {}

    if not isinstance(config_data, dict):
        raise TypeError(
            f"Config file {path} must contain a mapping at the top level, "
            f"got {type(config_data).__name__}"
        )

    # NOTE(review): the YAML values are passed as init keyword arguments,
    # which pydantic-settings ranks above environment variables — so STROM_*
    # variables cannot override keys present in the file. Confirm this
    # precedence is intended.
    return Settings(**config_data)
# Global settings instance, built once when this module is first imported
# (this reads the config file, if one is found, at import time).
settings = load_config()

View file

@ -0,0 +1,90 @@
"""
DuckDB database interface for Bronze (Raw) and Gold (Combined) layers.
"""
import duckdb
import polars as pl
from contextlib import contextmanager
from .config_loader import settings
@contextmanager
def get_connection(db_path: str = None):
    """
    Open a DuckDB connection and close it when the `with` block ends.

    Args:
        db_path: Database file to open; the configured database path is used
            when this is omitted or empty.

    Yields:
        The open DuckDB connection. It is closed on exit, even if the body
        of the `with` block raises.
    """
    target = db_path if db_path else settings.database.path
    connection = duckdb.connect(target)
    try:
        yield connection
    finally:
        connection.close()
def init_tables(con: duckdb.DuckDBPyConnection):
    """
    Create all pipeline tables on the given connection if they are missing.

    Three groups are created, in this order: one raw (timestamp, value) table
    per SMARD series, the raw weather table, and the combined table holding
    the joined price, energy and weather columns.

    Args:
        con: Open DuckDB connection on which the statements are executed.
    """
    # Bronze layer: raw SMARD series, all sharing the same two-column layout.
    bronze_series = (
        "prices_raw",
        "load_forecast_raw",
        "gen_total_raw",
        "wind_onshore_raw",
        "wind_offshore_raw",
        "pv_raw",
    )
    for table in bronze_series:
        con.execute(
            f"CREATE TABLE IF NOT EXISTS {table} (timestamp BIGINT PRIMARY KEY, value DOUBLE)"
        )

    # Bronze layer: raw weather records; the timestamp is stored as text.
    weather_ddl = """
CREATE TABLE IF NOT EXISTS weather_raw (
timestamp VARCHAR PRIMARY KEY,
temperature DOUBLE,
wind_speed DOUBLE,
solar DOUBLE,
sunshine DOUBLE,
cloud_cover DOUBLE,
precipitation DOUBLE
)
"""
    con.execute(weather_ddl)

    # Gold layer: transformed and joined data, keyed by a real TIMESTAMP.
    combined_ddl = """
CREATE TABLE IF NOT EXISTS combined (
timestamp TIMESTAMP PRIMARY KEY,
price DOUBLE,
load_forecast DOUBLE,
generation_total DOUBLE,
wind_total DOUBLE,
pv DOUBLE,
temperature DOUBLE,
wind_speed DOUBLE,
solar DOUBLE,
sunshine DOUBLE,
cloud_cover DOUBLE,
precipitation DOUBLE
)
"""
    con.execute(combined_ddl)
def upsert_raw(con: duckdb.DuckDBPyConnection, table_name: str, df: pl.DataFrame):
    """
    Inserts raw data using explicit columns to match the target table schema.

    Only columns that exist both in the target table and in `df` are
    written, and they are matched by name. Table columns absent from `df`
    are left to their default (NULL). Rows whose timestamp already exists
    are skipped.

    Args:
        con: Open DuckDB connection.
        table_name: Name of the raw table to insert into. It is interpolated
            into the SQL text, so it must be a trusted identifier.
        df: Frame holding the rows to insert; nothing happens when it is empty.

    Raises:
        ValueError: If `df` shares no column with the target table.
    """
    if df.is_empty():
        return

    table_cols = con.execute(f"DESCRIBE {table_name}").pl()["column_name"].to_list()
    shared_cols = [c for c in table_cols if c in df.columns]
    if not shared_cols:
        raise ValueError(
            f"DataFrame has no columns matching table {table_name!r}"
        )

    # The local name `df_to_insert` is referenced from the SQL text below:
    # DuckDB resolves it by looking up the Python variable, so do not rename it.
    df_to_insert = df.select(shared_cols)  # noqa: F841

    # Name the columns on both sides so a frame missing some table columns
    # still inserts correctly instead of failing on a column-count mismatch.
    col_list = ", ".join(f'"{c}"' for c in shared_cols)
    con.execute(
        f"INSERT INTO {table_name} ({col_list}) "
        f"SELECT {col_list} FROM df_to_insert "
        "ON CONFLICT (timestamp) DO NOTHING;"
    )
def upsert_combined(con: duckdb.DuckDBPyConnection, df: pl.DataFrame):
    """
    Append joined rows to the `combined` table, skipping existing timestamps.

    Args:
        con: Open DuckDB connection.
        df: Frame with the rows to insert; nothing happens when it is empty.
            The SQL refers to this parameter by its name `df`, and
            `SELECT *` maps its columns to the table by position, so the
            column order has to match the table definition.
    """
    if not df.is_empty():
        con.execute(
            "INSERT INTO combined SELECT * FROM df ON CONFLICT (timestamp) DO NOTHING;"
        )

View file

@ -0,0 +1,41 @@
import logging
import requests
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
logger = logging.getLogger(__name__)
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type((requests.exceptions.RequestException,)),
    reraise=True,
)
def make_requests(
    url: str, headers: dict = None, params: dict = None, timeout: int = 20
) -> dict:
    """
    Send a GET request and return the decoded JSON body.

    The call is attempted up to three times with exponential back-off when a
    `requests` exception is raised; the last exception is re-raised as is.

    Args:
        url: Address to request.
        headers: Optional HTTP headers; an empty dict is used when omitted.
        params: Optional query parameters; an empty dict is used when omitted.
        timeout: Timeout in seconds handed to `requests.get`.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.RequestException: On HTTP error status,
            connection failure or timeout, after logging it.
        Exception: Any other error, after logging it.
    """
    headers = headers if headers else {}
    params = params if params else {}

    try:
        logger.debug(f"Requesting URL: {url} with params: {params}")
        response = requests.get(
            url,
            headers=headers,
            params=params,
            timeout=timeout,
        )
        # Turn 4xx/5xx status codes into HTTPError.
        response.raise_for_status()
        body = response.json()
        return body
    # The clause order decides which label an exception gets, so keep it.
    except requests.exceptions.HTTPError as errh:
        logger.error(f"HTTP Error: {errh}")
        raise
    except requests.exceptions.ConnectionError as errc:
        logger.error(f"Connection Error: {errc}")
        raise
    except requests.exceptions.Timeout as errt:
        logger.error(f"Timeout Error: {errt}")
        raise
    except Exception as e:
        logger.error(f"Unknown Exception: {e}")
        raise

View file

@ -0,0 +1,15 @@
[project]
name = "common"
version = "0.1.0"
description = "Shared logic and utilities for the Strompreis Pipeline"
dependencies = [
    "duckdb>=1.4.4",
    "polars>=1.38.1",
    "pydantic>=2.12.5",
    "pydantic-settings>=2.12.0",
    "pyyaml>=6.0.3",
    # Imported by the request utilities (HTTP client and retry decorator).
    # NOTE(review): lower bounds are conservative guesses; align with the lockfile.
    "requests>=2.31",
    "tenacity>=8.2",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"