refactor: modernize project and improve error handling and documentation

This commit is contained in:
Patryk Hegenberg 2026-02-11 15:24:26 +01:00
parent e27bf2a098
commit d87d0cce11
17 changed files with 1793 additions and 188 deletions

View file

@ -1,76 +1,40 @@
"""
Data transformation logic for price and weather data.
"""
from datetime import timedelta
import polars as pl
def transform_weather(df: pl.DataFrame) -> pl.DataFrame:
    """
    Clean and standardize raw weather data.

    Keeps only the known weather columns that are actually present in
    ``df``, parses the ``timestamp`` strings (``%Y-%m-%dT%H:%M:%S%z``)
    into millisecond-precision UTC datetimes, and sorts the rows by
    ``timestamp``.

    Args:
        df: Raw weather frame with a string ``timestamp`` column.

    Returns:
        The trimmed, parsed and sorted frame. An empty input yields a
        new empty ``pl.DataFrame()`` that carries no columns.
    """
    if df.is_empty():
        return pl.DataFrame()

    wanted = [
        "timestamp",
        "temperature",
        "wind_speed",
        "solar",
        "sunshine",
        "cloud_cover",
        "precipitation",
    ]
    # Only select columns the input really has; the order follows `wanted`.
    present = [name for name in wanted if name in df.columns]

    parsed_timestamp = (
        pl.col("timestamp")
        .str.to_datetime(format="%Y-%m-%dT%H:%M:%S%z", time_zone="UTC")
        .cast(pl.Datetime("ms", time_zone="UTC"))
    )

    trimmed = df.select(present)
    return trimmed.with_columns(parsed_timestamp).sort("timestamp")
def transform_prices(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transform raw SMARD timestamps to UTC datetimes.

    Casts the ``timestamp`` column to a millisecond-precision datetime and
    then labels it as UTC. ``replace_time_zone`` attaches the zone without
    shifting the stored values.

    Args:
        df: Raw price frame with a ``timestamp`` column
            (presumably epoch milliseconds -- confirm against the caller).

    Returns:
        ``df`` with ``timestamp`` replaced by the UTC datetime column. An
        empty input yields a new empty ``pl.DataFrame()`` with no columns.
    """
    if df.is_empty():
        return pl.DataFrame()

    utc_timestamp = (
        pl.col("timestamp")
        .cast(pl.Datetime("ms"))
        .dt.replace_time_zone("UTC")
        .alias("timestamp")
    )
    return df.with_columns([utc_timestamp])
def transform_weather(df: pl.DataFrame) -> pl.DataFrame:
    """
    Parse the weather ``timestamp`` strings into UTC datetimes.

    The ``timestamp`` column is parsed with the format
    ``%Y-%m-%dT%H:%M:%S%z`` and cast to millisecond precision in UTC.
    All other columns pass through unchanged; there is no empty-input
    guard, column filtering or sorting here.

    NOTE(review): a function with this same name is defined earlier in
    this file; in Python the later definition replaces the earlier one.
    Confirm which of the two is meant to remain.

    Args:
        df: Weather frame with a string ``timestamp`` column.

    Returns:
        ``df`` with ``timestamp`` replaced by the parsed datetime column.
    """
    parsed_timestamp = (
        pl.col("timestamp")
        .str.to_datetime(
            format="%Y-%m-%dT%H:%M:%S%z",
            time_zone="UTC",
        )
        .cast(pl.Datetime("ms", time_zone="UTC"))
        .alias("timestamp")
    )
    return df.with_columns([parsed_timestamp])
def join_dataframes(df_prices: pl.DataFrame, df_weather: pl.DataFrame) -> pl.DataFrame:
    """
    Combine price and weather rows that share the same timestamp.

    Performs an inner join on ``timestamp``, so only timestamps present in
    both frames survive, and then narrows the result to the timestamp, the
    price and the six weather measurements.

    Args:
        df_prices: Frame providing ``timestamp`` and ``price``.
        df_weather: Frame providing ``timestamp`` and the weather columns.

    Returns:
        A frame with the columns ``timestamp``, ``price``, ``temperature``,
        ``wind_speed``, ``solar``, ``sunshine``, ``cloud_cover`` and
        ``precipitation``, in that order.
    """
    weather_fields = [
        "temperature",
        "wind_speed",
        "solar",
        "sunshine",
        "cloud_cover",
        "precipitation",
    ]
    merged = df_prices.join(df_weather, on="timestamp", how="inner")
    selection = [
        pl.col("timestamp"),
        pl.col("price"),
        pl.col(weather_fields),
    ]
    return merged.select(selection)
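# NOTE: Disabled alternative to join_dataframes, kept for reference. Instead of
# requiring exactly equal timestamps, it uses an as-of join that pairs each
# price row with the nearest weather row within a one-hour tolerance.
#
# def join_dataframes(df_prices: pl.DataFrame, df_weather: pl.DataFrame) -> pl.DataFrame:
#     return (
#         df_prices.sort("timestamp")  # mandatory: join_asof expects sorted input
#         .join_asof(
#             df_weather.sort("timestamp"),  # mandatory: join_asof expects sorted input
#             on="timestamp",
#             strategy="nearest",  # use "nearest" instead of "backward"
#             tolerance=timedelta(hours=1),
#         )
#         .select(
#             [
#                 pl.col("timestamp"),
#                 pl.col("price"),
#                 pl.col(
#                     [
#                         "temperature",
#                         "wind_speed",
#                         "solar",
#                         "sunshine",
#                         "cloud_cover",
#                         "precipitation",
#                     ]
#                 ),
#             ]
#         )
#     )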
pl.col("timestamp")
.cast(pl.Datetime("ms"))
.dt.replace_time_zone("UTC")
).sort("timestamp")