Source code for rws_py.dwd_analyze_store_second_threshold

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 16 11:16:16 2025

@author: atakan
"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import zipfile
import numpy as np
import os

from numpy.polynomial import Polynomial
from scipy import signal

from rws_py import weather_load_storage
from rws_py import generate_load_curve
from rws_py import analyze_residual
from rws_py import RESULTS_DIR, WEATHER_DATA_DIR


pd.set_option('display.max_columns', None)

WITH_LOAD = True  # analyze the weather data togeher with an estimated load or not

SOLAR_THRESHOLD = 0  # J/cm2 [x10000/600 -> W/m²] ; ca. 100 W/m2
# km/h (converted: 1 m/s = 3.6 km/h), energy conversion:cubic
WIND_THRESHOLD = 0


# URLs for DWD data Düsseldorf/Cuxhaven
stations = {"Cuxhaven": '00891',
            "Düsseldorf": '01078',
            'Freiburg': '01443'}
# stations = {"Cuxhaven": '00891'}
# ACT_STATION = "Düsseldorf"
SOLAR_URL_BASE = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/solar/historical"
WIND_URL_BASE = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/historical"

# SOLAR_ACT = '10minutenwerte_SOLAR_' + \
#     stations[ACT_STATION]+'_20200101_20231231_hist.zip'
# WIND_ACT = '10minutenwerte_wind_' + \
#     stations[ACT_STATION]+'_20200101_20231231_hist.zip'
solar_columns = ['MESS_DATUM', 'QN', 'GS_10']
wind_columns = ['MESS_DATUM', 'QN', 'FF_10']

DATA_SOURCE =  WEATHER_DATA_DIR


[docs] def load_data(bse_url_in, filename_in, exp_col, directory=DATA_SOURCE, energy_column=2, wind_bounds=(3.3, 11.0, 25), normalize=True, resample=None): """ Load weatherdata from a directory if it already exists, and only download and save if it does not exist. Parameters ---------- bse_url_in : string url from websiteN. filename_in : string DESCRIPTION. exp_col : List Which column (names) are expected at least in the file. directory : string, optional folder to look for data first and store them after downloading, when they do not exist. The default is DATA_SOURCE. energy_column : Integer, optional position of thecolumn within expected_columns, where the solar energy density or wind velocity is to be found. The default is 2. wind_bounds : list or tuple of length 3, optional Minimum, nominal and maximum wind speed of turbine in m/s . The default is (3.3, 11.0, 25). normalize : Boolean, optional for normalizing the "power" column by its mean value, The default value is True. resample : String/None, optional if 'h' the data will be resampled to hourly, taken the mean(other resampling would also be ok, check the panadas resample manual). If it is None, no resampling will be performed. The default is None. Raises ------ IOError DESCRIPTION. KeyError DESCRIPTION. Returns ------- df : Pandas DataFrame The data with the important values in the power column and the datime as index. """ verbose = False filepath = os.path.join(directory, filename_in) if os.path.exists(filepath): print(f"Data found: {filename_in}") else: url_act = bse_url_in+"/"+filename_in print(f"Data not found: {filename_in}. Downloading...") response = requests.get(url_act) if response.status_code == 200: with open(filepath, 'wb') as f: f.write(response.content) print(f"File downloaded and saved: {filename_in}") else: raise IOError(f"Failed to download file: {filename_in}") with zipfile.ZipFile(filepath) as z: csv_filename = z.namelist()[0] with z.open(csv_filename) as csv_file: df = pd.read_csv(csv_file, sep=None, engine='python', header=0) # Debug: Zeige die tatsächlichen Spaltennamen if verbose: print("Actual columns:", df.columns.tolist()) df.columns = df.columns.str.strip() df = df.replace(-999, 0) # Debug: Zeige die Spaltennamen nach strip() if verbose: print("Columns after strip:", df.columns.tolist()) # Check for missing columns before proceeding missing_columns = [ col for col in exp_col if col not in df.columns] if missing_columns: raise KeyError( f"The following columns are missing in the data: {missing_columns}") energy_column = exp_col[energy_column] # Prüfe, ob das Format der MESS_DATUM-Spalte numerisch ist (Wind-Daten) if df['MESS_DATUM'].dtype == 'int64' or df['MESS_DATUM'].dtype == 'object' \ and df['MESS_DATUM'].str.isnumeric().all(): df['MESS_DATUM'] = pd.to_datetime( df['MESS_DATUM'], format='%Y%m%d%H%M') else: df['MESS_DATUM'] = pd.to_datetime(df['MESS_DATUM']) if filename_in.find("SOLAR") > 0: df["power"] = df[energy_column]*16.66 # from J/(cm2 h) to W/m2 df["solar intensity / (J/cm2/10 min)"] = df[energy_column] elif filename_in.find("wind") > 0: df["power"] = (df[energy_column])**3 df["velocity / (m/s)"] = df[energy_column] # km/h ->m/s df.loc[(df[energy_column] > wind_bounds[1]), "power"] = wind_bounds[1]**3 # reaching the nominal wind speed df.loc[(df[energy_column] < wind_bounds[0]) | (df[energy_column] > wind_bounds[2]), "power"] = 0 # too low or too high velocity else: print("Problem: Neither wind nor solar data!") df.set_index('MESS_DATUM', inplace=True) # dateime is Index now if resample is not None: df = df.drop('eor', axis=1).resample(resample).mean() df['time_diff_seconds'] = df.index.to_series().diff().dt.total_seconds() if normalize: # Jahrweise Mittelwert berechnen mean_act = df.groupby(df.index.year)['power'].transform('mean') # Originalwerte speichern df["power not normalized"] = df["power"].copy() # Jahrweise Normierung df["power"] = df["power"] / mean_act return df
[docs] def resample_to_hourly(data_in, energy): """ Resample 10-minute data to hourly data by taking the mean. """ data_hourly_in = None if energy == "Solar": data_hourly_in = data_in.resample('h').mean() elif energy == "Wind": data_hourly_in = data_in.resample('h').mean() # BA must be checked! return data_hourly_in
[docs] def analyze_weather(data, column, threshold, time_per_point=1): """ Analyze weather data based on periods above and below a threshold. """ data = data.copy() # Create sign series based on the threshold data['above_threshold'] = data[column] > threshold data['block'] = (data['above_threshold'].diff() != 0).cumsum() # Calculate period length in hours (hourly intervals -> hours) data['count'] = 1 # 1 hour per row, for aggregation data['count'] = data.groupby('block')['count'].transform( 'sum') * time_per_point # Convert to hours # Calculate the total number of inactive periods per block (scaled to hours) df_period = data.groupby('block')[column].agg( ['count', 'mean', 'std', 'sum', 'min', 'max']).reset_index() # Calculate the mean for active periods df_period['mean'] = df_period.apply( lambda row: data[data['block'] == row['block']][column].mean(), axis=1) df_period['count'] = df_period['count'] * time_per_point # Lists for active-inactive and inactive-active combinations active_inactive_list = [] inactive_active_list = [] for i in range(len(df_period)): current_row = df_period.iloc[i] current_row = current_row.copy() if i + 1 < len(df_period): next_row = df_period.iloc[i + 1] if current_row['mean'] > threshold: # Active-Inactive combination if not df_period.iloc[i + 1]['mean'] > threshold: count_inv = next_row['count'] # Add the value as a new column in current_row current_row['count_inv'] = count_inv active_inactive_list.append(current_row) else: # Inactive-Active combination if df_period.iloc[i + 1]['mean'] > threshold: count_inv = next_row['count'] # Add the value as a new column in current_row current_row['count_inv'] = count_inv inactive_active_list.append(current_row) # Ensure that the lists for active and inactive periods are 2D data # Use `.concat()` on the list of DataFrames and ensure the DataFrame is created correctly df_active_inactive = pd.DataFrame(active_inactive_list) df_inactive_active = pd.DataFrame(inactive_active_list) return df_period, df_active_inactive, df_inactive_active
# Modifizierte Visualisierungsfunktion
[docs] def plot_enhanced_analysis(data, results, column, threshold, en_t, year, fil_name): """ Create enhanced visualizations for the analysis results with improved period distribution plots """ fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) # Plot 1: Time series with thresholds part = data.iloc[7000:10000] sns.lineplot(data=part, y=column, x="MESS_DATUM", ax=ax1) ax1.axhline(threshold, color='k', label='Basic threshold', linestyle='--') ax1.axhline(results['direct_threshold'], color='r', label='Direct usage threshold', linestyle=':') ax1.set_title(f'Time Series with Thresholds\n{en_t}, {year}') ax1.legend() # Plot 2: Period Length Distributions (log scale) # Verbesserte Darstellung der Periodenlängen mit logarithmischer Skala ax2.set_xscale('linear') ax2.set_yscale('linear') bins = np.linspace( 1, max(results['active_inactive']['count'].max(), 1), 30) if not results['active_inactive'].empty: sns.histplot(data=results['active_inactive'], x='count', ax=ax2, label='Basic threshold periods', bins=bins, alpha=0.6) if not results['direct_period_data'].empty: sns.histplot(data=results['direct_period_data'], x='count', ax=ax2, label='Direct usage periods', color='r', bins=bins, alpha=0.4) ax2.set_title('Period Length Distribution') ax2.set_xlabel('Period length (hours)') ax2.set_ylabel('Count') ax2.legend() # Plot 3: Scatter plot of period lengths vs intensities if not results['active_inactive'].empty: sns.scatterplot(data=results['active_inactive'], x='count', y='mean', size='sum', alpha=0.6, ax=ax3) ax3.set_xscale('linear') ax3.set_title('Period Length vs Intensity') ax3.set_xlabel('Period length (hours)') ax3.set_ylabel('Mean intensity') # Plot 4: Cumulative distribution of period lengths if not results['active_inactive'].empty: counts_sorted = np.sort(results['active_inactive']['count']) cumulative = np.arange(1, len(counts_sorted) + 1) / len(counts_sorted) ax4.plot(counts_sorted, cumulative, 'b-', label='Basic threshold') if not results['direct_period_data'].empty: counts_sorted_direct = np.sort(results['direct_period_data']['count']) cumulative_direct = np.arange( 1, len(counts_sorted_direct) + 1) / len(counts_sorted_direct) ax4.plot(counts_sorted_direct, cumulative_direct, 'r-', label='Direct usage') ax4.set_xscale('log') ax4.set_title('Cumulative Distribution of Period Lengths') ax4.set_xlabel('Period length (hours)') ax4.set_ylabel('Cumulative probability') ax4.legend() plt.tight_layout() plt.savefig(fil_name + "-enhanced-analysis.png", bbox_inches='tight', dpi=300) plt.show()
[docs] def plot_analysis(data, results, column, threshold, en_t, year, fil_name, period_pause, bins=[0, 1, 2, 4, 8, 16, 32, 64, 128, 200]): """ Create visualizations for the analysis results with improved period distribution plots """ # Visualize results fig0 = plt.figure() sns.scatterplot(data=results['direct_inactive'], x='count', y='count_inv', size='sum', hue='mean') plt.title(f'Active vs Inactive Periods, {en_t}, {year}') plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0) plt.tight_layout() plt.savefig(str(fil_name)+"-periods.png") plt.show() # plt.figure() histo2, xedges, yedges = np.histogram2d( results['direct_inactive']['count_inv'], results['direct_inactive']['count'], bins=bins) fig0, ax0 = plt.subplots(1) ax0.imshow(histo2) xranges = [ f"period {xedges[i]:.1f} - {xedges[i+1]:.1f}" for i in range(len(xedges) - 1)] yranges = [ f"pause {yedges[i]:.1f} - {yedges[i+1]:.1f}" for i in range(len(xedges) - 1)] ax0.set_xticks(np.arange(len(xranges)), labels=xranges, rotation=45, ha="right", rotation_mode="anchor") ax0.set_yticks(np.arange(len(yranges)), labels=yranges) for i in range(len(yranges)): for j in range(len(xranges)): if histo2[i, j] > 0: ax0.text(j, i, f"{histo2[i, j]:.0f}", ha="center", va="center", color="w", fontsize=10) ax0.set_title(fil_name+", 2D Histogram") fig0.tight_layout() fig0.savefig(fil_name+"-2D-hist.png") plt.show() df = pd.DataFrame(histo2.T, index=yranges, columns=xranges) # xedges als eigene Spalte hinzufügen df.insert(0, "y_edges_start", yedges[:-1]) # CSV speichern df.to_csv(fil_name+"histogram_data.csv") fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2) pcm = ax1.pcolormesh(xedges, yedges, histo2, cmap='rainbow') fig.colorbar(pcm, ax=ax1) ax1.set_xlabel("Period length / h") ax1.set_ylabel("pause length / h") sum_count_fit = Polynomial.fit( results['direct_inactive']['count'], results['direct_inactive']['sum'], 2) ax2.plot(results['direct_inactive']['count'], results['direct_inactive']['sum'], "+k") ax2.plot(results['direct_inactive']['count'], sum_count_fit( results['direct_inactive']['count']), '.') ax2.set_xlabel("Period length / h") ax2.set_ylabel("total intensity") ax3.plot(period_pause['count'], period_pause['mean'], "+k") ax3.set_xlabel("Period length / h") ax3.set_ylabel("mean intensity") act_corr = signal.correlate(data[column], data[column]) # f, Pxx_den = signal.periodogram(data[expected_columns[2]], fs=1/600) # ax4.semilogy(f, Pxx_den) ax4.plot(act_corr) ax4.set_xlabel("time shift / h") ax4.set_ylabel("autocorrelation") fig.suptitle(en_t) fig.savefig(fil_name+"-cor-hist.png") plt.show() # for illustration fff, (aaa, aab) = plt.subplots(2) part = data.iloc[7000:10000] sns.lineplot(data=part, y=column, x="MESS_DATUM", ax=aaa) aaa.axhline(threshold, color='k') aaa.set_title(en_t) # Erstelle den Inlay-Plot # Erstellen Sie Achsen für den Inlay-Plot sns.lineplot(data=data, y=column, x="MESS_DATUM", ax=aab) aab.axhline(threshold, color="k") aab.set_title('All Data') # optional fff.savefig(fil_name+"-timeseries.png") plt.show() fig_n, axn = plt.subplots(1) results['direct_inactive'].hist( column='count_inv', ax=axn, bins=bins, log=True) results['direct_inactive'].hist( column='count', ax=axn, bins=bins, rwidth=.75, log=True) axn.set_xlabel("period/pause duration / h") axn.legend(["pause", 'with']) axn.set_title(en_t+", histogram")
if __name__ == "__main__": VERBOSE = True columns = ["Station", "Year", "power", "Threshold", "sum", "mean", "10 min periods", "periods>1h", "sum with energy", "sum without energy", "ratio of period lengths", "mean_active_periods", "direct_threshold", "storage_periods_count", "period_mean_duration", "mean_pause_duration", 'std_pause_duration', 'std_duration'] evaluation = pd.DataFrame(columns=columns) outp = [] histos = [] BINS = 30 all_dat = [] ENERGY_COLUMN = "power" # Name of the column with the energy proportional data # Prepare data sources for ACT_STATION in stations.keys(): SOLAR_ACT = '10minutenwerte_SOLAR_' + \ stations[ACT_STATION]+'_20200101_20240902_hist.zip' WIND_ACT = '10minutenwerte_wind_' + \ stations[ACT_STATION]+'_20200101_20241231_hist.zip' all_dat.append([SOLAR_URL_BASE, SOLAR_ACT, solar_columns, SOLAR_THRESHOLD, ACT_STATION]) all_dat.append([WIND_URL_BASE, WIND_ACT, wind_columns, WIND_THRESHOLD, ACT_STATION]) # Main analysis loop for dat_act in all_dat: base_url = dat_act[0] filename = dat_act[1] expected_columns = dat_act[2] thresh_out = dat_act[3] ACT_STATION = dat_act[4] # Load data and # Resample data to hourly data_all = load_data(base_url, filename, expected_columns, resample="h") # Process each year for year in range(2020, 2022): print(f"year: {year}\n") en_t = ACT_STATION+", " if dat_act[0].count('wind'): act_energy = "Wind" else: act_energy = "Solar" en_t += act_energy f_name = f"{en_t}, {year}" f_base=f_name.replace(", ", "-") dir_name = RESULTS_DIR fil_name = dir_name / f_base data_hourly = data_all[data_all.index.year == year] description = data_hourly.describe() description.to_csv(f"{fil_name.stem}-describe.csv", sep=";") if WITH_LOAD: load_par = { 'min_weekday': 0.650, 'max_weekday': 1.0, 'min_weekend': 0.650, 'max_weekend': 0.85, # only for step-function 'intermediate_fraction': .75, # scales everything 'multiplier': data_hourly["power"].mean() } load_df = generate_load_curve( data_hourly.index, load_parameters=load_par) with_load = weather_load_storage(data_hourly, load_df) # BA with load must be stored and evaluated/plotted if VERBOSE: print(description) # Perform enhanced analysis (includes basic analysis) if WITH_LOAD: data_hourly = with_load.copy() results_act = analyze_residual(data_hourly, "residual-no-storage") else: pass # Extract results for further processing period_pause = results_act['active_inactive'] long_per_pause = period_pause[period_pause['count'] >= 1] # Calculate metrics sum_all = data_hourly[ENERGY_COLUMN].sum() mean_all = data_hourly[ENERGY_COLUMN].mean() sh_periods = period_pause['count'].count() long_periods = long_per_pause['count'].count() periods_with_energy = period_pause['count'].sum() periods_without_energy = period_pause['count_inv'].sum() per_ratio = periods_with_energy / \ periods_without_energy if periods_without_energy != 0 else 0 # Print analysis results print( f"Threshold:{thresh_out:.1f}, No.-10 min periods: {sh_periods}, above 1h: {long_periods}") print( f"Sum of all times, with energy {periods_with_energy:.1f}, without {periods_without_energy:.1f}, ratio:{per_ratio:.3f} \n") # Create evaluation row eval_row = [ ACT_STATION, year, act_energy, thresh_out, sum_all, mean_all, sh_periods, long_periods, periods_with_energy, periods_without_energy, per_ratio, results_act['basic_stats']['mean_active_periods'], results_act['direct_threshold'], results_act['storage_periods']['count'], results_act['storage_periods']['mean_duration'], results_act['storage_periods']['mean_pause_duration'], results_act['storage_periods']['std_pause_duration'], results_act['storage_periods']['std_duration'], ] # Add to evaluation DataFrame evaluation.loc[len(evaluation)] = eval_row # Create visualizations plot_analysis( data_hourly, results_act, ENERGY_COLUMN, thresh_out, en_t, year, str(fil_name)+"-H-", period_pause) plot_enhanced_analysis( data_hourly, results_act, ENERGY_COLUMN, thresh_out, en_t, year, str(fil_name)+"-H-") # Save final results evaluation.to_csv(dir_name/("H-"+"all-eval-enhanced.csv"), sep=";", encoding='utf-8')