Source code for ecodynelec.preprocessing.enr_residual_utils

"""
Helper functions to load and preprocess the renewable electricity data from Pronovo and EnergyCharts.
"""

import os
import re
from datetime import datetime

import pandas as pd

pronovo_types_map = {
    '*': {
        'Wind': 'Wind (kWh)',
        'Solar': 'Photovoltaik (kWh)',
        'Biogas': 'Biogas (kWh)',
        'Biomass_1_crops': 'Energiepflanze (kWh)',
        'Biomass_2_waste': 'Forst- und Landwirtschaftliche Abfälle (kWh)',
        'Waste_1': 'Kehrichtverbrennung (erneuerbar) (kWh)',
        'Sewage_gas': 'Klärgas (kWh)',
    },
    '5': {
        'Wind': 'Wind',
        'Solar': 'Photovoltaic',
        'Biomass_all': 'Biomasse'
    },
    '2': {
        'Wind': '-A.Windenergie [kWh]',
        'Solar': '-A.Photovoltaik [kWh]',
        'Biogas': '-A.Biogas [kWh]',
        'Biomass_1_crops': '-A.Energiepflanze [kWh]',
        'Biomass_2_waste': '-A.Forst- und Landwirtschaftliche Abfälle [kWh]',
        'Waste_1': '-A.Kehrichtverbrennung [kWh]',
        'Waste_2.50': '-A.Kehrichtverbrennung (erneuerbar).50 [kWh]',
        'Waste_3.100': '-A.Kehrichtverbrennung (erneuerbar).100 [kWh]',
        'Waste_4_no_enr': '-A.Kehrichtverbrennung (nicht erneuerbar) [kWh]',
        'Sewage_gas': '-A.Klärgas [kWh]',
        # 'Gas_1': '-A.Erdgas Dampfturbine [kWh]',
        # 'Gas_2': '-A.Gas- und Dampfkombikraftwerk [kWh]',
        # 'Gas_3': '-A.Gasturbine [kWh]',
        # 'Unknown': '-A.Leichtwasserreaktor [kWh]', #light water  -> matches nuclear production
        # 'Combustion_engine': '-A.Verbrennungsmotor [kWh]'
        'Hydro_Pumpage': '+A.Pumpspeicherkraftwerk [kWh]',
        'Hydro_Pumped_Storage': '-A.Pumpspeicherkraftwerk [kWh]'
    },
    '7': {
        'Wind': '-A.Windturbine [kWh]',
        'Solar': '-A.Photovoltaik [kWh]',
        'Biogas': '-A.Biogas [kWh]',
        'Biomass_1_crops': '-A.Energiepflanze [kWh]',
        'Biomass_2_waste': '-A.Forst- und Landwirtschaftliche Abfälle [kWh]',
        'Waste_1': '-A.Kehrichtverbrennung [kWh]',
        'Waste_2.50': '-A.Kehrichtverbrennung (erneuerbar).50 [kWh]',
        'Waste_3.100': '-A.Kehrichtverbrennung (erneuerbar).100 [kWh]',
        'Waste_4_no_enr': '-A.Kehrichtverbrennung (nicht erneuerbar) [kWh]',
        'Sewage_gas': '-A.Klärgas [kWh]',
        # 'Gas_1': '-A.Erdgas Dampfturbine [kWh]',
        # 'Gas_2': '-A.Gas- und Dampfkombikraftwerk [kWh]',
        # 'Gas_3': '-A.Gasturbine [kWh]',
        # 'Unknown': '-A.Leichtwasserreaktor [kWh]', #light water  -> matches nuclear production
        # 'Combustion_engine': '-A.Verbrennungsmotor [kWh]'
        'Hydro_Pumpage': '+A.Pumpspeicherkraft [kWh]',
        'Hydro_Pumped_Storage': '-A.Pumpspeicherkraft [kWh]'
    }
}
"""
Mapping linking pronovo column names to the actual types of plants used in this project.
There is a different mapping for each pronovo file format (see load_pronovo_file).
"""

ec_types_to_types = {
    'Photovoltaïque': 'Solar',
    'Éolienne': 'Wind',
    'Biogaz': 'Biogas',
    'Biomasse': 'Biomass_all',
    'Cultures énergétiques': 'Biomass_1_crops',
    'Déchets forestiers et agricoles': 'Biomass_2_waste',
    'Incinération': 'Waste_1',
    "Biogaz de station d'épuration": 'Sewage_gas',
    "Gaz d'égout": 'Sewage_gas'
}
"""
Mapping linking energy charts column names to the actual types of plants used in this project.
"""

data_mappings = {
    'Solar': [
        {
            'start': '2020-01-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Solar'
        },
        {
            'start': '2020-01-01',
            'end': 'last',
            'from_start': '2020-01-01',
            'from_end': 'last',
            'source': 'EC',
            'series': 'Solar'
        }
    ],
    'Wind': [
        {
            'start': '2020-01-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Wind'
        }
    ],
    'Waste': [
        {
            'start': '2020-05-01',
            'end': '2022-09-30',
            'source': 'Pronovo',
            'series': 'Waste_1'
        },
        {
            'start': '2022-10-01',
            'end': '2022-11-30',
            'from_start': '2021-10-01',
            'from_end': '2021-11-30',
            'source': 'EC',
            'series': 'Waste_1'
        },
        {
            'start': '2023-01-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Waste_1'
        }
    ],
    'Biogas': [
        {
            'start': '2020-05-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Biogas'
        }
    ],
    'Sewage_gas': [
        {
            'start': '2020-05-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Sewage_gas'
        }
    ],
    'Biomass_1_crops': [
        {
            'start': '2020-05-01',
            'end': '2022-12-31',
            'source': 'Pronovo',
            'series': 'Biomass_1_crops'
        },
        {
            'start': '2023-01-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Biomass_1_crops'
        }
    ],
    'Biomass_2_waste': [
        {
            'start': '2020-05-01',
            'end': '2021-12-31',
            'source': 'Pronovo',
            'series': 'Biomass_2_waste'
        },
        {
            'start': '2022-01-01',
            'end': '2022-12-31',
            'from_start': '2021-01-01',
            'from_end': '2021-12-31',
            'source': 'Pronovo',
            'series': 'Biomass_2_waste'
        },
        {
            'start': '2023-01-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Biomass_2_waste'
        }
    ],
    'Hydro_Pumped_Storage': [
        {
            'start': '2023-01-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Hydro_Pumped_Storage'
        }
    ],
    'Hydro_Pumpage': [
        {
            'start': '2023-01-01',
            'end': 'last',
            'source': 'Pronovo',
            'series': 'Hydro_Pumpage'
        }
    ]
}
"""
**The reorganization rules are only valid for the 2020-2024 period.**
    
A dict giving the mapping between the columns of the final data (result) and the source data from Pronovo and EnergyCharts. 
Used by the reorganize_enr_data method. 

Should follow the structure :

.. code-block:: python

    'Category1': [
        {
            'start': 'start date', # start date in the result (and in source data if from_start isn't set)
            'end': 'end date', # end date in the result (and in source data if from_end isn't set) (use 'last' to get the real end of the source data)
            'from_start': '2021-01-01', # optional, start date in the source data (allows to copy data from one date to another)
            'from_end': '2021-12-31', # optional, end data in the source data
            'source': 'Pronovo' or 'EC', # source of the data
            'series': 'Series_Name_In_Source_Data' # name of the column to take in the source data
        },
        ...
    ],
    ...
"""


[docs] def get_enr_data_from_pronovo_ec(path_dir, verbose=False): """ Reads the pronovo and energy charts data from the given directory, and returns a dataframe containing the reorganized data. The reorganized data is the best estimation of the real renewable electricity productions (solar, wind, waste...), from what is available. **The reorganization rules are only valid for the 2020-2022 period.** Parameters ---------- path_dir : str The path of the directory containing the pronovo and energy charts data. The directory should contain two subdirectories: - pronovo_data: containing the pronovo data (a 'prod_year' directory for each input) - ec_data: containing the energy charts data (annual files) See the documentation below for more details. verbose : bool, optional Whether to print debug information. The default is False. Returns ------- mapped_data : pd.DataFrame A dataframe containing the reorganized data, indexed by date. """ pronovo_data = read_enr_data_from_pronovo(path_dir, verbose=verbose) ec_data = read_enr_data_from_energy_charts(path_dir, verbose=verbose) mapped_data = reorganize_enr_data(pronovo_data, ec_data) return mapped_data
[docs] def read_enr_data_from_pronovo(path_dir, verbose=False): """ Reads all the pronovo data from the given directory, and returns a dataframe containing the data. Parameters ---------- path_dir : str The path of the directory containing the pronovo data. The directory should contain a subdirectory for each year, named 'prod_year' (e.g. 'prod_2020'). Each subdirectory should contain the pronovo data files of this year (.csv files). See the documentation below for more details. verbose : bool, optional Whether to print debug information. The default is False. Returns ------- pronovo_data : pd.DataFrame A dataframe containing the pronovo data, indexed by date. """ pronovo_dir = os.path.join(path_dir, 'pronovo_data') if not os.path.isdir(pronovo_dir): raise FileNotFoundError( f"Directory {pronovo_dir} doesn't exist. Please create it and add actual pronovo data directories (follow the procedure explain in the documentation).") # Read pronovo data years = [] for file in os.listdir(pronovo_dir): if file.startswith('prod_') and os.path.isdir(os.path.join(pronovo_dir, file)): years.append(file) if verbose: print(f'Reading pronovo directories: {years}') types = list(pronovo_types_map['2'].keys()) types.append('Biomass_all') pronovo_data = load_all_pronovo_files(root_dir=pronovo_dir + '/', types=types,verbose=verbose) return pronovo_data
[docs] def read_enr_data_from_energy_charts(path_dir, verbose=False): """ Reads all the energy charts data from the given directory, and returns a dataframe containing the data. Parameters ---------- path_dir : str The path of the directory containing the energy charts data. The directory should contain a subdirectory named 'ec_data'. The 'ec_data' directory should contain the yearly energy charts data files (.csv files). See the documentation below for more details. verbose : bool, optional Whether to print debug information. The default is False. Returns ------- df_ec_data : pd.DataFrame A dataframe containing the energy charts data, indexed by date. """ ec_dir = os.path.join(path_dir, 'ec_data') if not os.path.isdir(ec_dir): raise FileNotFoundError( f"Directory {ec_dir} doesn't exist. Please create it and add actual energy charts data files (follow the procedure explain in the documentation).") # Read EnergyCharts data ec_data = [] for f in os.listdir(ec_dir): if f.endswith('.csv'): if verbose: print('Reading ' + f) data = pd.read_csv(ec_dir + '/' + f, index_col=0) data = data.drop(index=data.index[0], columns=[col for col in data.columns if col.startswith('Unnamed')]) data = data.rename(columns=ec_types_to_types) ec_data.append(data) df_ec_data = pd.concat(ec_data, axis=0).fillna(0).astype(float) # create DatetimeIndex from D.M.Y format to match pronovo data df_ec_data.index = pd.to_datetime(df_ec_data.index, format='%d.%m.%Y') return df_ec_data
[docs] def reorganize_enr_data(pronovo_data: pd.DataFrame, ec_data: pd.DataFrame) -> pd.DataFrame: """ | Reorganizes the pronovo and energy charts data to match the final data format. | The reorganized data is the best estimation of the real renewable electricity productions, from what is available. | **The reorganization rules are only valid for the 2020-2024 period.** Parameters ---------- pronovo_data : pd.DataFrame The pronovo data, indexed by date. ec_data : pd.DataFrame The energy charts data, indexed by date. Returns ------- mapped_data : pd.DataFrame A dataframe containing the reorganized data, indexed by date. """ def _dedupe_index(df: pd.DataFrame) -> pd.DataFrame: """ | Deduplicates the index of the dataframe. :param df: DataFrame :return: A dataframe with deduplicated index """ df = df.sort_index() if not df.index.is_unique: # aggregates duplicated timestamps (e.g., DST fall-back hour) by sum df = df.groupby(level=0).sum() return df pronovo_data = _dedupe_index(pronovo_data) ec_data = _dedupe_index(ec_data) mapped_data = pd.DataFrame(index=pronovo_data.index, columns=data_mappings.keys()) real_end = ec_data.index[-1] if pronovo_data.index[-1] < real_end: real_end = pronovo_data.index[-1] real_end = str(real_end.date()) for col in data_mappings.keys(): maps = data_mappings[col] for mapping in maps: from_ec = mapping['source'] == 'EC' src_df = ec_data.copy() if from_ec else pronovo_data mapping_end_str = real_end if mapping.get('end') == 'last' else mapping['end'] start = datetime.strptime(mapping['start'], '%Y-%m-%d') end = datetime.strptime(mapping_end_str, '%Y-%m-%d') + pd.Timedelta(hours=23) if ('from_start' in mapping) or ('from_end' in mapping): from_start_str = mapping.get('from_start', mapping['start']) from_end_str = mapping.get('from_end', mapping_end_str) if from_end_str in ('last', 'end'): from_end_str = mapping_end_str from_start = datetime.strptime(from_start_str, '%Y-%m-%d') from_end = datetime.strptime(from_end_str, '%Y-%m-%d') + pd.Timedelta(hours=23) if from_ec: # Scale past pronovo hours to actual energy charts daily production prod_ec = src_df.loc[start:end, mapping['series']] prod_pronovo = pronovo_data.loc[from_start:from_end, mapping['series']].copy() prod_pronovo.index = mapped_data.loc[start:end, col].index daily_y = prod_pronovo.resample('D').sum() daily_y.index = prod_ec.index prod_ec = prod_ec * 1e6 # Convert to kWh factors = prod_ec / daily_y # Adjust the index to include the hours of the last day adjusted_dates = pd.date_range(start=factors.index[0], end=factors.index[-1] + pd.Timedelta(hours=23), freq='H') resampled_dates = factors.reindex(adjusted_dates, method='ffill') prod_pronovo = prod_pronovo.multiply(resampled_dates) else: # Directly copy from one date to another date, from Pronovo data prod_pronovo = src_df.loc[from_start:from_end, mapping['series']].copy() prod_pronovo.index = mapped_data.loc[start:end, col].index mapped_data.loc[start:end, col] = prod_pronovo else: if mapping['source'] == 'EC': # Convert daily EnergyCharts data to hourly data src_df = src_df * 1000000 # Convert to kWh src_df = src_df.resample( 'H').ffill() / 24 # Convert to hourly data (with a uniform repartition over the day in first approximation) print('Warning: uniform daily distribution of EC data was used for column', col) # simple copy mapped_data.loc[start:end, col] = src_df.loc[start:end, mapping['series']] return mapped_data
[docs] def load_all_pronovo_files(root_dir: str, types: [str], verbose: bool = False) -> pd.DataFrame: """ Loads all pronovo files in the given directories, applying daily scaling with energy charts ecd_enr_model (the hourly variation comes from the pronovo ecd_enr_model, and the daily total from energy charts ecd_enr_model, if available). The scaling is done with csv files starting by "EC". All other csv files are considered as pronovo files. :param root_dir: The root directory containing the pronovo 'prod_year' directories :param types: The types of plants to extract (in ['Wind', 'Solar']) :param verbose: Whether to print debug information :return: A dataframe containing the pronovo ecd_enr_model for all 'types', indexed by date """ Ys = [] for subdir, dirs, files in os.walk(root_dir): for file in files: if file.lower().endswith('.csv'): full_path = os.path.join(subdir, file) try: f_y = load_pronovo_file(full_path, types, verbose=verbose) Ys.append(f_y) except Exception as e: if verbose: print(f"[WARN] {full_path}: {e}") if not Ys: raise RuntimeError(f"Aucun CSV exploitable trouvé sous {root_dir}") pronovo_data = pd.concat(Ys).sort_index() return pronovo_data
def _parse_mixed_pronovo_index(idx_like: pd.Series) -> pd.DatetimeIndex: """ Parse the index which can contains: - datetimes 'dd.mm.yyyy HH:MM' - Excel numbers (44256.35 or 44'256.35) :param idx_like: A pandas Series or Index of strings/numbers representing timestamps in mixed formats. :return: A pandas DatetimeIndex with all valid timestamps parsed and rounded to the nearest 15 minutes. """ _DATE_STRICT_FMT = '%d.%m.%Y %H:%M' _DATE_RE = re.compile(r"^\s*\d{2}\.\d{2}\.\d{4}\s+\d{2}:\d{2}\s*$") _EXCEL_RE = re.compile(r"^\s*\d{2}'\d{3}(?:\.\d+)?\s*$|^\s*\d{5}(?:\.\d+)?\s*$") s = idx_like.astype(str).str.strip() mask_txt = s.str.match(_DATE_RE) dt = pd.to_datetime(s.where(mask_txt), format=_DATE_STRICT_FMT, errors='coerce', dayfirst=True) mask_xl = s.str.match(_EXCEL_RE) if mask_xl.any(): xl = (s.where(mask_xl) .str.replace("'", "", regex=False) # thousands Switzerland .str.replace(",", ".", regex=False)) xl_num = pd.to_numeric(xl, errors='coerce') dt_xl = pd.to_datetime(xl_num, unit='d', origin='1899-12-30', errors='coerce') dt_xl = dt_xl.dt.round('15min') dt = dt.fillna(dt_xl) if dt.isna().any(): dt_fallback = pd.to_datetime(s, errors='coerce', dayfirst=True) dt = dt.fillna(dt_fallback) return pd.DatetimeIndex(dt)
[docs] def load_pronovo_file(file: str, types: [str], verbose: bool = False) -> pd.DataFrame: """ Load pronovo ecd_enr_model from a csv file. Supports years from 2020 to 2022 (historically the format of the files changes every semester). :param file: the path of the file to load :param types: The types of plants to extract (in ['Wind', 'Solar']) :param verbose: Whether to print debug information :return: A dataframe containing the pronovo ecd_enr_model for all 'types', indexed by date """ if file.endswith('2020.csv'): format = 5 if int(file[-11:-9]) < 5 else 6 elif file.endswith('2021.csv'): format = 3 if int(file[-11:-9]) < 8 else 4 elif file.endswith('2022.csv'): format = 1 elif file.endswith('202303_CH_Total_Quartal_def.csv'): format = 7 else: format = 2 if verbose: print(f'Load fmt {format} {file}', end='\n') if format == 5: pronovo_data = pd.read_csv(f'{file}', index_col=0, skiprows=2, encoding='windows-1252', sep=';') pronovo_types = pronovo_types_map['5'] elif format == 6: pronovo_data = pd.read_csv(f'{file}', index_col=0, skiprows=10, encoding='windows-1252', sep=';') pronovo_types = pronovo_types_map['*'] elif format == 3: pronovo_data = pd.read_csv(f'{file}', index_col=0, skiprows=16, encoding='windows-1252', sep=';') pronovo_types = pronovo_types_map['*'] elif format == 4: pronovo_data = pd.read_csv(f'{file}', index_col=0, skiprows=18, encoding='windows-1252', sep=';') pronovo_types = pronovo_types_map['*'] elif format == 1: pronovo_data = pd.read_csv(f'{file}', index_col=0, skiprows=17, encoding='windows-1252', sep=';') pronovo_types = pronovo_types_map['*'] elif format == 2: pronovo_data = pd.read_csv(f'{file}', index_col=1, skiprows=1, encoding='windows-1252', sep=';') pronovo_types = pronovo_types_map[str(format)] elif format == 7: pronovo_data = pd.read_csv(f'{file}', index_col=1, skiprows=1, encoding='windows-1252', sep=';') pronovo_types = pronovo_types_map[str(format)] else: raise Exception('Unknown format') dt_idx = _parse_mixed_pronovo_index(pronovo_data.index.to_series()) valid = dt_idx.notna() if not valid.any(): raise ValueError(f"No valid date detected in {file}") pronovo_data = pronovo_data.loc[valid].copy() pronovo_data.index = dt_idx[valid] pronovo_types_a = [pronovo_types[tpe] for tpe in types if tpe in pronovo_types and pronovo_types[tpe] in pronovo_data.columns] pronovo_data = pronovo_data[pronovo_types_a] pronovo_types_inv = {v: k for k, v in pronovo_types.items()} for i in range(len(types)): pronovo_data.rename(columns=pronovo_types_inv, inplace=True) pronovo_data = pronovo_data.applymap( lambda x: float(x) if type(x) != str else float(x.replace('\'', '').replace('’', ''))) pronovo_data = pronovo_data.resample('H').sum() pronovo_data = pronovo_data.iloc[:-1] # last value is first hour if the next month return pronovo_data