Source code for rws_py.dwd_analyze_store_second_threshold

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 16 11:16:16 2025

@author: atakan
"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import zipfile
import numpy as np
import os

from numpy.polynomial import Polynomial
from scipy import signal

from rws_py import weather_load_storage
from rws_py import generate_load_curve
from rws_py import analyze_residual
from rws_py import RESULTS_DIR, WEATHER_DATA_DIR


pd.set_option('display.max_columns', None)

WITH_LOAD = True  # analyze the weather data togeher with an estimated load or not

SOLAR_THRESHOLD = 0  # J/cm2 [x10000/600 -> W/m²] ; ca. 100 W/m2
# km/h (converted: 1 m/s = 3.6 km/h), energy conversion:cubic
WIND_THRESHOLD = 0


# URLs for DWD data Düsseldorf/Cuxhaven
stations = {"Cuxhaven": '00891',
            "Düsseldorf": '01078',
            'Freiburg': '01443'}
# stations = {"Cuxhaven": '00891'}
# ACT_STATION = "Düsseldorf"
SOLAR_URL_BASE = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/solar/historical"
WIND_URL_BASE = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/wind/historical"

# SOLAR_ACT = '10minutenwerte_SOLAR_' + \
#     stations[ACT_STATION]+'_20200101_20231231_hist.zip'
# WIND_ACT = '10minutenwerte_wind_' + \
#     stations[ACT_STATION]+'_20200101_20231231_hist.zip'
solar_columns = ['MESS_DATUM', 'QN', 'GS_10']
wind_columns = ['MESS_DATUM', 'QN', 'FF_10']

DATA_SOURCE =  WEATHER_DATA_DIR



[docs]
def load_data(bse_url_in, filename_in, exp_col, directory=DATA_SOURCE,
              energy_column=2,
              wind_bounds=(3.3, 11.0, 25),
              normalize=True,
              resample=None):
    """
    Load weatherdata from a directory if it already exists,
    and only download and save if it does not exist.


    Parameters
    ----------
    bse_url_in : string
        url from websiteN.
    filename_in : string
        DESCRIPTION.
    exp_col : List
        Which column (names) are expected at least in the file.
    directory : string, optional
        folder to look for data first and store them after downloading, when
        they do not exist. The default is DATA_SOURCE.
    energy_column : Integer, optional
        position of thecolumn within expected_columns, where the solar energy
        density or wind velocity is to be found. The default is 2.
    wind_bounds : list or tuple of length 3, optional
        Minimum, nominal and maximum wind speed of turbine in m/s . The default
        is (3.3, 11.0, 25).
    normalize : Boolean, optional
        for normalizing the "power" column by its mean value, The default
        value is True.
    resample : String/None, optional
        if 'h' the data will be resampled to hourly, taken the mean(other
        resampling would also be ok, check the panadas resample manual). If it
        is None, no resampling will be performed. The default is None.

    Raises
    ------
    IOError
        DESCRIPTION.
    KeyError
        DESCRIPTION.

    Returns
    -------
    df : Pandas DataFrame
        The data with the important values in the power column and the datime
        as index.

    """

    verbose = False
    filepath = os.path.join(directory, filename_in)
    if os.path.exists(filepath):
        print(f"Data found: {filename_in}")
    else:
        url_act = bse_url_in+"/"+filename_in
        print(f"Data not found: {filename_in}. Downloading...")
        response = requests.get(url_act)
        if response.status_code == 200:
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"File downloaded and saved: {filename_in}")
        else:
            raise IOError(f"Failed to download file: {filename_in}")

    with zipfile.ZipFile(filepath) as z:
        csv_filename = z.namelist()[0]
        with z.open(csv_filename) as csv_file:
            df = pd.read_csv(csv_file, sep=None, engine='python', header=0)

    # Debug: Zeige die tatsächlichen Spaltennamen
    if verbose:
        print("Actual columns:", df.columns.tolist())

    df.columns = df.columns.str.strip()
    df = df.replace(-999, 0)

    # Debug: Zeige die Spaltennamen nach strip()
    if verbose:
        print("Columns after strip:", df.columns.tolist())

    # Check for missing columns before proceeding
    missing_columns = [
        col for col in exp_col if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"The following columns are missing in the data: {missing_columns}")
    energy_column = exp_col[energy_column]

    # Prüfe, ob das Format der MESS_DATUM-Spalte numerisch ist (Wind-Daten)
    if df['MESS_DATUM'].dtype == 'int64' or df['MESS_DATUM'].dtype == 'object' \
        and df['MESS_DATUM'].str.isnumeric().all():
        df['MESS_DATUM'] = pd.to_datetime(
            df['MESS_DATUM'], format='%Y%m%d%H%M')

    else:
        df['MESS_DATUM'] = pd.to_datetime(df['MESS_DATUM'])

    if filename_in.find("SOLAR") > 0:
        df["power"] = df[energy_column]*16.66  # from J/(cm2 h) to W/m2

        df["solar intensity / (J/cm2/10 min)"] = df[energy_column]
    elif filename_in.find("wind") > 0:
        df["power"] = (df[energy_column])**3
        df["velocity / (m/s)"] = df[energy_column]  # km/h ->m/s

        df.loc[(df[energy_column] > wind_bounds[1]),
               "power"] = wind_bounds[1]**3  # reaching the nominal wind speed
        df.loc[(df[energy_column] < wind_bounds[0]) | (df[energy_column]
                                                       > wind_bounds[2]),
               "power"] = 0  # too low or too high velocity

    else:
        print("Problem: Neither wind nor solar data!")


    df.set_index('MESS_DATUM', inplace=True)  # dateime is Index now
    if resample is not None:
        df = df.drop('eor', axis=1).resample(resample).mean()
    df['time_diff_seconds'] = df.index.to_series().diff().dt.total_seconds()

    if normalize:
        # Jahrweise Mittelwert berechnen
        mean_act = df.groupby(df.index.year)['power'].transform('mean')

        # Originalwerte speichern
        df["power not normalized"] = df["power"].copy()

        # Jahrweise Normierung
        df["power"] = df["power"] / mean_act


    return df




[docs]
def resample_to_hourly(data_in, energy):
    """
    Resample 10-minute data to hourly data by taking the mean.
    """

    data_hourly_in = None
    if energy == "Solar":

        data_hourly_in = data_in.resample('h').mean()

    elif energy == "Wind":

        data_hourly_in = data_in.resample('h').mean()  # BA must be checked!
    return data_hourly_in




[docs]
def analyze_weather(data, column, threshold, time_per_point=1):
    """
    Analyze weather data based on periods above and below a threshold.
    """
    data = data.copy()
    # Create sign series based on the threshold
    data['above_threshold'] = data[column] > threshold
    data['block'] = (data['above_threshold'].diff() != 0).cumsum()

    # Calculate period length in hours (hourly intervals -> hours)
    data['count'] = 1  # 1 hour per row, for aggregation
    data['count'] = data.groupby('block')['count'].transform(
        'sum') * time_per_point  # Convert to hours

    # Calculate the total number of inactive periods per block (scaled to hours)
    df_period = data.groupby('block')[column].agg(
        ['count', 'mean', 'std', 'sum', 'min', 'max']).reset_index()

    # Calculate the mean for active periods
    df_period['mean'] = df_period.apply(
        lambda row: data[data['block'] == row['block']][column].mean(), axis=1)
    df_period['count'] = df_period['count'] * time_per_point

    # Lists for active-inactive and inactive-active combinations
    active_inactive_list = []
    inactive_active_list = []

    for i in range(len(df_period)):
        current_row = df_period.iloc[i]
        current_row = current_row.copy()
        if i + 1 < len(df_period):
            next_row = df_period.iloc[i + 1]
            if current_row['mean'] > threshold:
                # Active-Inactive combination
                if not df_period.iloc[i + 1]['mean'] > threshold:
                    count_inv = next_row['count']
                    # Add the value as a new column in current_row
                    current_row['count_inv'] = count_inv
                    active_inactive_list.append(current_row)
            else:
                # Inactive-Active combination
                if df_period.iloc[i + 1]['mean'] > threshold:
                    count_inv = next_row['count']
                    # Add the value as a new column in current_row
                    current_row['count_inv'] = count_inv
                    inactive_active_list.append(current_row)

    # Ensure that the lists for active and inactive periods are 2D data
    # Use `.concat()` on the list of DataFrames and ensure the DataFrame is created correctly
    df_active_inactive = pd.DataFrame(active_inactive_list)
    df_inactive_active = pd.DataFrame(inactive_active_list)

    return df_period, df_active_inactive, df_inactive_active



# Modifizierte Visualisierungsfunktion



[docs]
def plot_enhanced_analysis(data, results, column, threshold, en_t, year, fil_name):
    """
    Create enhanced visualizations for the analysis results with improved period distribution plots
    """
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # Plot 1: Time series with thresholds
    part = data.iloc[7000:10000]
    sns.lineplot(data=part, y=column, x="MESS_DATUM", ax=ax1)
    ax1.axhline(threshold, color='k', label='Basic threshold', linestyle='--')
    ax1.axhline(results['direct_threshold'], color='r',
                label='Direct usage threshold', linestyle=':')
    ax1.set_title(f'Time Series with Thresholds\n{en_t}, {year}')
    ax1.legend()

    # Plot 2: Period Length Distributions (log scale)
    # Verbesserte Darstellung der Periodenlängen mit logarithmischer Skala
    ax2.set_xscale('linear')
    ax2.set_yscale('linear')
    bins = np.linspace(
        1, max(results['active_inactive']['count'].max(), 1), 30)
    if not results['active_inactive'].empty:
        sns.histplot(data=results['active_inactive'], x='count', ax=ax2,
                     label='Basic threshold periods', bins=bins, alpha=0.6)

    if not results['direct_period_data'].empty:
        sns.histplot(data=results['direct_period_data'], x='count', ax=ax2,
                     label='Direct usage periods', color='r', bins=bins, alpha=0.4)

    ax2.set_title('Period Length Distribution')
    ax2.set_xlabel('Period length (hours)')
    ax2.set_ylabel('Count')
    ax2.legend()

    # Plot 3: Scatter plot of period lengths vs intensities
    if not results['active_inactive'].empty:
        sns.scatterplot(data=results['active_inactive'],
                        x='count', y='mean',
                        size='sum',
                        alpha=0.6,
                        ax=ax3)
        ax3.set_xscale('linear')
        ax3.set_title('Period Length vs Intensity')
        ax3.set_xlabel('Period length (hours)')
        ax3.set_ylabel('Mean intensity')

    # Plot 4: Cumulative distribution of period lengths
    if not results['active_inactive'].empty:
        counts_sorted = np.sort(results['active_inactive']['count'])
        cumulative = np.arange(1, len(counts_sorted) + 1) / len(counts_sorted)
        ax4.plot(counts_sorted, cumulative, 'b-', label='Basic threshold')

    if not results['direct_period_data'].empty:
        counts_sorted_direct = np.sort(results['direct_period_data']['count'])
        cumulative_direct = np.arange(
            1, len(counts_sorted_direct) + 1) / len(counts_sorted_direct)
        ax4.plot(counts_sorted_direct, cumulative_direct,
                 'r-', label='Direct usage')

    ax4.set_xscale('log')
    ax4.set_title('Cumulative Distribution of Period Lengths')
    ax4.set_xlabel('Period length (hours)')
    ax4.set_ylabel('Cumulative probability')
    ax4.legend()

    plt.tight_layout()
    plt.savefig(fil_name + "-enhanced-analysis.png",
                bbox_inches='tight', dpi=300)
    plt.show()




[docs]
def plot_analysis(data, results, column, threshold, en_t, year, fil_name,
                  period_pause,
                  bins=[0, 1, 2, 4, 8, 16, 32, 64, 128, 200]):
    """
    Create  visualizations for the analysis results with improved period distribution plots
    """
    # Visualize results
    fig0 = plt.figure()

    sns.scatterplot(data=results['direct_inactive'],
                    x='count', y='count_inv', size='sum', hue='mean')
    plt.title(f'Active vs Inactive Periods, {en_t}, {year}')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
    plt.tight_layout()
    plt.savefig(str(fil_name)+"-periods.png")
    plt.show()
    # plt.figure()
    histo2, xedges, yedges = np.histogram2d(
        results['direct_inactive']['count_inv'], results['direct_inactive']['count'], bins=bins)

    fig0, ax0 = plt.subplots(1)

    ax0.imshow(histo2)
    xranges = [
        f"period {xedges[i]:.1f} - {xedges[i+1]:.1f}" for i in range(len(xedges) - 1)]
    yranges = [
        f"pause {yedges[i]:.1f} - {yedges[i+1]:.1f}" for i in range(len(xedges) - 1)]
    ax0.set_xticks(np.arange(len(xranges)), labels=xranges,
                   rotation=45, ha="right", rotation_mode="anchor")
    ax0.set_yticks(np.arange(len(yranges)), labels=yranges)
    for i in range(len(yranges)):
        for j in range(len(xranges)):
            if histo2[i, j] > 0:
                ax0.text(j, i, f"{histo2[i, j]:.0f}",
                                ha="center", va="center", color="w", fontsize=10)

    ax0.set_title(fil_name+", 2D Histogram")
    fig0.tight_layout()
    fig0.savefig(fil_name+"-2D-hist.png")
    plt.show()
    df = pd.DataFrame(histo2.T, index=yranges, columns=xranges)

    # xedges als eigene Spalte hinzufügen
    df.insert(0, "y_edges_start", yedges[:-1])

    # CSV speichern
    df.to_csv(fil_name+"histogram_data.csv")

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2)

    pcm = ax1.pcolormesh(xedges, yedges, histo2, cmap='rainbow')
    fig.colorbar(pcm, ax=ax1)
    ax1.set_xlabel("Period length / h")
    ax1.set_ylabel("pause length / h")
    sum_count_fit = Polynomial.fit(
        results['direct_inactive']['count'], results['direct_inactive']['sum'], 2)
    ax2.plot(results['direct_inactive']['count'],
             results['direct_inactive']['sum'], "+k")
    ax2.plot(results['direct_inactive']['count'], sum_count_fit(
        results['direct_inactive']['count']), '.')
    ax2.set_xlabel("Period length / h")
    ax2.set_ylabel("total intensity")
    ax3.plot(period_pause['count'], period_pause['mean'], "+k")
    ax3.set_xlabel("Period length / h")
    ax3.set_ylabel("mean intensity")

    act_corr = signal.correlate(data[column], data[column])
    # f, Pxx_den = signal.periodogram(data[expected_columns[2]], fs=1/600)
    # ax4.semilogy(f, Pxx_den)
    ax4.plot(act_corr)
    ax4.set_xlabel("time shift / h")
    ax4.set_ylabel("autocorrelation")
    fig.suptitle(en_t)
    fig.savefig(fil_name+"-cor-hist.png")
    plt.show()

    # for illustration
    fff, (aaa, aab) = plt.subplots(2)
    part = data.iloc[7000:10000]
    sns.lineplot(data=part, y=column, x="MESS_DATUM", ax=aaa)
    aaa.axhline(threshold, color='k')
    aaa.set_title(en_t)
    # Erstelle den Inlay-Plot
    # Erstellen Sie Achsen für den Inlay-Plot

    sns.lineplot(data=data, y=column, x="MESS_DATUM", ax=aab)
    aab.axhline(threshold, color="k")
    aab.set_title('All Data')  # optional
    fff.savefig(fil_name+"-timeseries.png")
    plt.show()
    fig_n, axn = plt.subplots(1)

    results['direct_inactive'].hist(
        column='count_inv', ax=axn, bins=bins, log=True)
    results['direct_inactive'].hist(
        column='count', ax=axn, bins=bins, rwidth=.75, log=True)
    axn.set_xlabel("period/pause duration / h")
    axn.legend(["pause", 'with'])

    axn.set_title(en_t+", histogram")



if __name__ == "__main__":
    VERBOSE = True

    columns = ["Station", "Year", "power", "Threshold", "sum", "mean",
               "10 min periods", "periods>1h", "sum with energy",
               "sum without energy", "ratio of period lengths",
               "mean_active_periods", "direct_threshold",
               "storage_periods_count", "period_mean_duration", "mean_pause_duration",
               'std_pause_duration',
               'std_duration']

    evaluation = pd.DataFrame(columns=columns)
    outp = []
    histos = []
    BINS = 30

    all_dat = []
    ENERGY_COLUMN = "power"  # Name of the column with the energy proportional data

    # Prepare data sources
    for ACT_STATION in stations.keys():
        SOLAR_ACT = '10minutenwerte_SOLAR_' + \
            stations[ACT_STATION]+'_20200101_20240902_hist.zip'
        WIND_ACT = '10minutenwerte_wind_' + \
            stations[ACT_STATION]+'_20200101_20241231_hist.zip'
        all_dat.append([SOLAR_URL_BASE, SOLAR_ACT,
                       solar_columns, SOLAR_THRESHOLD, ACT_STATION])
        all_dat.append([WIND_URL_BASE, WIND_ACT, wind_columns,
                       WIND_THRESHOLD, ACT_STATION])

    # Main analysis loop
    for dat_act in all_dat:
        base_url = dat_act[0]
        filename = dat_act[1]
        expected_columns = dat_act[2]
        thresh_out = dat_act[3]
        ACT_STATION = dat_act[4]

        # Load data and  # Resample data to hourly
        data_all = load_data(base_url, filename, expected_columns,
                             resample="h")

        # Process each year
        for year in range(2020, 2022):
            print(f"year: {year}\n")
            en_t = ACT_STATION+", "
            if dat_act[0].count('wind'):
                act_energy = "Wind"
            else:
                act_energy = "Solar"
            en_t += act_energy
            f_name = f"{en_t}, {year}"
            f_base=f_name.replace(", ", "-")
            dir_name = RESULTS_DIR

            fil_name = dir_name / f_base
            data_hourly = data_all[data_all.index.year == year]

           
            description = data_hourly.describe()
            description.to_csv(f"{fil_name.stem}-describe.csv", sep=";")

            if WITH_LOAD:
                load_par = {
                    'min_weekday': 0.650,
                    'max_weekday': 1.0,
                    'min_weekend': 0.650,
                    'max_weekend': 0.85,  # only for step-function
                    'intermediate_fraction': .75,
                    # scales everything
                    'multiplier': data_hourly["power"].mean()
                }
                load_df = generate_load_curve(
                    data_hourly.index, load_parameters=load_par)
                with_load = weather_load_storage(data_hourly, load_df)
                # BA with load must be stored and evaluated/plotted

            if VERBOSE:
                print(description)
            # Perform enhanced analysis (includes basic analysis)
            if WITH_LOAD:
                data_hourly = with_load.copy()
                results_act = analyze_residual(data_hourly, "residual-no-storage")

            else:
                pass

            # Extract results for further processing
            period_pause = results_act['active_inactive']
            long_per_pause = period_pause[period_pause['count'] >= 1]

            # Calculate metrics
            sum_all = data_hourly[ENERGY_COLUMN].sum()
            mean_all = data_hourly[ENERGY_COLUMN].mean()
            sh_periods = period_pause['count'].count()
            long_periods = long_per_pause['count'].count()
            periods_with_energy = period_pause['count'].sum()
            periods_without_energy = period_pause['count_inv'].sum()
            per_ratio = periods_with_energy / \
                periods_without_energy if periods_without_energy != 0 else 0

            # Print analysis results
            print(
                f"Threshold:{thresh_out:.1f}, No.-10 min periods: {sh_periods}, above 1h: {long_periods}")
            print(
                f"Sum of all times, with energy {periods_with_energy:.1f}, without {periods_without_energy:.1f}, ratio:{per_ratio:.3f} \n")

            # Create evaluation row
            eval_row = [
                ACT_STATION,
                year,
                act_energy,
                thresh_out,
                sum_all,
                mean_all,
                sh_periods,
                long_periods,
                periods_with_energy,
                periods_without_energy,
                per_ratio,
                results_act['basic_stats']['mean_active_periods'],
                results_act['direct_threshold'],
                results_act['storage_periods']['count'],
                results_act['storage_periods']['mean_duration'],
                results_act['storage_periods']['mean_pause_duration'],
                results_act['storage_periods']['std_pause_duration'],
                results_act['storage_periods']['std_duration'],
            ]

            # Add to evaluation DataFrame
            evaluation.loc[len(evaluation)] = eval_row

            # Create visualizations
            plot_analysis(
                data_hourly, results_act, ENERGY_COLUMN, thresh_out, en_t, year,
                str(fil_name)+"-H-",
                period_pause)
            plot_enhanced_analysis(
                data_hourly, results_act, ENERGY_COLUMN, thresh_out, en_t, year,
                str(fil_name)+"-H-")

    # Save final results
    evaluation.to_csv(dir_name/("H-"+"all-eval-enhanced.csv"),
                      sep=";", encoding='utf-8')
Source code for rws_py.dwd_analyze_store_second_threshold

Residual Loads for Carnot Batteries

Navigation

Related Topics