Data-Driven Steel Production Optimization¶


Data-Driven Temperature Prediction for Electricity Use Optimization in the Steel Industry

To reduce production costs, a metallurgical plant aims to lower electricity consumption during the steel processing stage. This requires precise control of the alloy’s temperature. The task is to develop a predictive model for temperature forecasting. The model will be used to simulate the technological process.


Description of the Processing Workflow

Steel is processed in a metal ladle with a capacity of about 100 tons. To withstand high temperatures, the ladle is lined with refractory bricks on the inside. The molten steel is poured into the ladle and heated to the required temperature using graphite electrodes, which are installed on the ladle’s lid.

First comes desulfurization — sulfur is removed from the steel, and its chemical composition is adjusted by adding impurities. Then the steel is alloyed: pieces of alloy are added from a bulk-materials bunker, or the additives are introduced as cored wire through a special feeding machine.

Before adding alloying materials for the first time, specialists perform a chemical analysis of the steel and measure its temperature. Then the temperature is raised for a few minutes, and only after that are the alloying materials added. The steel is then purged with an inert gas to mix it, after which measurements are taken again. This cycle is repeated until the required chemical composition and the optimal melting temperature are achieved.

After that, the molten steel is sent either for final metal treatment or to the continuous casting machine. From there, the finished product comes out in the form of slab blanks.

Project Setup (Step I)¶

Dependencies¶

In [1]:
import os
import random
import sys

from dataclasses import dataclass
from enum import Enum
from functools import wraps

import matplotlib.pyplot as plt
import matplotlib.axes._axes as type_ax
import numpy as np
import optuna
import optuna.importance as optuna_importance
import pandas as pd
import phik
import seaborn as sns
import shap
import torch
import torch.nn as nn
import torch.optim as optim

from catboost import CatBoostRegressor, __version__ as catboost_v
from sklearn import __version__ as sklearn_v
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sqlalchemy import create_engine, text, __version__ as sqlalchemy_v
from sqlalchemy.engine import Result
from sqlalchemy.engine.base import Connection
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary, __version__ as torchinfo_v


"Python v" + sys.version.split(" (")[0]
Out[1]:
'Python v3.11.1'
In [2]:
print("catboost:   ", catboost_v)
print("numpy:      ", np.__version__)
print("optuna:     ", optuna.__version__)
print("pandas:     ", pd.__version__)
print("phik:       ", phik.__version__)
print("seaborn:    ", sns.__version__)
print("shap:       ", shap.__version__)
print("sklearn:    ", sklearn_v)
print("sqlalchemy: ", sqlalchemy_v)
print("torch:      ", torch.__version__)
print("torchinfo:  ", torchinfo_v)
catboost:    1.2.7
numpy:       1.26.4
optuna:      4.2.1
pandas:      2.1.4
phik:        0.12.4
seaborn:     0.13.2
shap:        0.46.0
sklearn:     1.4.1.post1
sqlalchemy:  2.0.36
torch:       2.5.1
torchinfo:   1.8.0

Constants¶

In [3]:
SEED = 1702025  # 2025-02-17
DB_PATH = "en.db"
FMT_SECONDS = "\'%s\'"
DEV = False
RF_TRIALS = 2 if DEV else 20
CATB_TRIALS = 2 if DEV else 20
NN_TRIALS = 2 if DEV else 20


if DEV:
    display(os.getcwd())

Database Schema¶

The data is stored in SQLite, a DBMS in which the entire database is contained in a single file. The database consists of several tables:

Table Name Description
steel.data_arc Data about the electrodes
steel.data_bulk Data about the amount of bulk materials
steel.data_bulk_time Data about the timing of bulk material delivery
steel.data_gas Data about the purging of the alloy with gas
steel.data_temp Data about temperature measurements
steel.data_wire Data about the amount of wire materials
steel.data_wire_time Data about the timing of wire material delivery
In [4]:
class Columns:
    fk = "key"

    def over(self, op: str, as_: str = "", asc: str = "ASC", **kw) -> str:
        order_by = f'ORDER BY {getattr(self, "value")}'
        part_by, as_x = kw.get("partition_by", self.fk), f" AS {as_}" if as_ else as_
        return f'{op} OVER (PARTITION BY {part_by} {order_by} {asc}){as_x}'

    def row_number(self, as_: str, *args, **kwargs) -> str:
        return self.over("ROW_NUMBER()", as_, *args, **kwargs)

    @property
    def lag(cls: Enum):
        return f'LAG({cls.value})'

    @property
    def alias(cls: Enum):
        return f'{cls.value} AS {cls.name.lower()}'

    @classmethod
    def columns(cls, alias: bool = False, add: list[str] = []) -> tuple[str, ...]:
        if alias:
            return tuple([cls.fk, *add, *(getattr(k, "alias") for k in cls)])
        return tuple([cls.fk, *add, *(str(getattr(k, "name")).lower() for k in cls)])
    
    @staticmethod
    def strftime(column, *args, fmt: str, as_: str = ""):
        strftime_ = f'STRFTIME({fmt}, {column})' + (f" AS {as_}" if as_ else as_)
        return "".join([strftime_, *args])

Production Process Related Tables¶


Table: steel.data_arc

Column Name Description
key Batch number
BeginHeat Start time of heating
EndHeat End time of heating
ActivePower The value of active power
ReactivePower The value of reactive power

Table: steel.data_gas

Column Name Description
key Batch number
gas Volume of the gas delivered

Table: steel.data_temp

Column Name Description
key Batch number
MeasureTime Time of measurement
Temperature Temperature value

In [5]:
class Arc(Columns, Enum):
    BEGIN_HEAT = 'DATETIME("BeginHeat")'
    END_HEAT = 'DATETIME("EndHeat")'
    ACTIVE_POWER = '"ActivePower"'
    REACTIVE_POWER = '"ReactivePower"'


class Gas(Columns, Enum):
    GAS_1 = "Gas1"


class Temp(Columns, Enum):
    MEAUSURE_TIME = 'DATETIME("MeasureTime")'
    TEMPERATURE = '"Temperature"'
    

Arc.columns(), Gas.columns(), Temp.columns()
Out[5]:
(('key', 'begin_heat', 'end_heat', 'active_power', 'reactive_power'),
 ('key', 'gas_1'),
 ('key', 'meausure_time', 'temperature'))

Impurity Tables¶


Table: steel.data_bulk

Column Name Description
key Batch number
Bulk1 ... Bulk15 Volume of the added material

Table: steel.data_bulk_time

Column Name Description
key Batch number
Bulk1 ... Bulk15 Time of material addition

Table: steel.data_wire

Column Name Description
key Batch number
Wire1 ... Wire9 Volume of the wire materials delivered

Table: steel.data_wire_time

Column Name Description
key Batch number
Wire1 ... Wire9 Time of wire material delivery

Database Configuration¶

In [6]:
class DatabaseConfig:
    def __init__(self, path: str):
        """src: https://gist.github.com/eugen-hoppe/485201df52b69397bf44ecee2d0effc8"""
        self.path = path
        self.cache_sql: list = None

    def __create_engine__(self):
        return create_engine(f"sqlite:///{self.path}", echo=False)

    def db_connection(self, func):
        engine = self.__create_engine__()
        @wraps(func)
        def wrapper(*args, **kwargs):
            with engine.connect() as connection:
                try:
                    return func(connection, *args, **kwargs)
                except Exception as e:
                    raise e
        return wrapper
    
    def print_statement(self) -> None:
        print("\n" + " ".join(self.cache_sql) + ";" + "\n")

    def print_statement_pretty(self, *br, sep: str = "\n    ", **shift) -> None:
        sql_ = sep.join(self.cache_sql).replace(f"    FROM{sep}", "FROM ")
        for br_id, (keyword, shift) in enumerate(shift.items()):
            kw_ = keyword.upper().replace("_", " ")
            sql_ = sql_.replace(kw_, br[br_id] + shift * " " + kw_)
        print("\n" + sql_ + ";" + "\n")

    @staticmethod
    def get_df(io: Result) -> pd.DataFrame:
        return pd.DataFrame([tuple(r) for r in io.fetchall()], columns=list(io.keys()))
In [7]:
sql = DatabaseConfig(path=DB_PATH)


@sql.db_connection
def select(conn: Connection, *columns, from_: str = "", **kw) -> pd.DataFrame:
    """src: https://gist.github.com/eugen-hoppe/485201df52b69397bf44ecee2d0effc8"""
    c_ = [col + ", " for col in columns][:-1] + [columns[-1]] if len(columns) else ["*"]
    kw_fx = lambda x: (
        (str(x).upper().replace("_", " ") + " ").replace("  ", " ").replace("  ", " ")
    )
    sql.cache_sql = ["SELECT", *c_, "FROM", from_, *[kw_fx(k) + str(kw[k]) for k in kw]]
    return DatabaseConfig.get_df(conn.execute(text(" ".join(sql.cache_sql) + ";")))
In [8]:
@dataclass
class DataFramesFromDB:
    data_arc: pd.DataFrame = None
    data_bulk: pd.DataFrame = None
    data_bulk_time: pd.DataFrame = None
    data_gas: pd.DataFrame = None
    data_temp: pd.DataFrame = None
    data_wire: pd.DataFrame = None
    data_wire_time: pd.DataFrame = None

Global Variables¶

In [9]:
# Reproducibility
# ===============
random.seed(SEED)
np.random.seed(SEED)  # Set NumPy random seed
torch.manual_seed(SEED)  # Set PyTorch random seed
torch.cuda.manual_seed(SEED)  # Ensure reproducibility on GPUs (if available)
torch.cuda.manual_seed_all(SEED)  # If using multi-GPU
torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
torch.backends.cudnn.benchmark = False
optuna_sampler = optuna.samplers.TPESampler(seed=SEED)


# Logging
# =======
optuna.logging.set_verbosity(optuna.logging.WARNING)


SEED
Out[9]:
1702025

Data Overview¶

In [10]:
from_db = DataFramesFromDB()

Arc¶

In [11]:
from_db.data_arc = select(from_="data_arc")
from_db.data_arc.columns = Arc.columns()
sql.print_statement()
from_db.data_arc.head()
SELECT * FROM data_arc;

Out[11]:
key begin_heat end_heat active_power reactive_power
0 1 2019-05-03 11:02:14 2019-05-03 11:06:02 0.305130 0.211253
1 1 2019-05-03 11:07:28 2019-05-03 11:10:33 0.765658 0.477438
2 1 2019-05-03 11:11:44 2019-05-03 11:14:36 0.580313 0.430460
3 1 2019-05-03 11:18:14 2019-05-03 11:24:19 0.518496 0.379979
4 1 2019-05-03 11:26:09 2019-05-03 11:28:37 0.867133 0.643691

These cells retrieve the data_arc table from the database, rename its columns using the definitions in the Arc enum, print the executed SQL query, and display the first few rows of the resulting DataFrame.

In [12]:
# Window Queries
# ==============
lag_query =  Columns.strftime(
    Arc.BEGIN_HEAT.value,  # .                           A: begin_heat @ step x
    " - ",
    Columns.strftime(  # .                               B: end_heat   @ step x-1
        Arc.BEGIN_HEAT.over(Arc.END_HEAT.lag), as_="no_heat_seconds", fmt=FMT_SECONDS,
    ),
    fmt=FMT_SECONDS,
)
over_queries = [
    Arc.BEGIN_HEAT.row_number("step"),
    Arc.END_HEAT.row_number("step_end"),
    lag_query,  # .                                      C: no_heat_seconds = A - B
]


# Main Query
# ==========
from_db.data_arc = select(*Arc.columns(alias=True, add=over_queries), from_="data_arc")

sql.print_statement_pretty("\n", over=8)
from_db.data_arc.head(3)
SELECT
    key, 
    ROW_NUMBER() 
        OVER (PARTITION BY key ORDER BY DATETIME("BeginHeat") ASC) AS step, 
    ROW_NUMBER() 
        OVER (PARTITION BY key ORDER BY DATETIME("EndHeat") ASC) AS step_end, 
    STRFTIME('%s', DATETIME("BeginHeat")) - STRFTIME('%s', LAG(DATETIME("EndHeat")) 
        OVER (PARTITION BY key ORDER BY DATETIME("BeginHeat") ASC)) AS no_heat_seconds, 
    DATETIME("BeginHeat") AS begin_heat, 
    DATETIME("EndHeat") AS end_heat, 
    "ActivePower" AS active_power, 
    "ReactivePower" AS reactive_power
FROM data_arc;

Out[12]:
key step step_end no_heat_seconds begin_heat end_heat active_power reactive_power
0 1 1 1 NaN 2019-05-03 11:02:14 2019-05-03 11:06:02 0.305130 0.211253
1 1 2 2 86.0 2019-05-03 11:07:28 2019-05-03 11:10:33 0.765658 0.477438
2 1 3 3 71.0 2019-05-03 11:11:44 2019-05-03 11:14:36 0.580313 0.430460
In [13]:
if sum(from_db.data_arc["step"] - from_db.data_arc["step_end"]) == 0:
    from_db.data_arc.drop(columns="step_end", inplace=True)  # Drop redundant column

from_db.data_arc.head(8)
Out[13]:
key step no_heat_seconds begin_heat end_heat active_power reactive_power
0 1 1 NaN 2019-05-03 11:02:14 2019-05-03 11:06:02 0.305130 0.211253
1 1 2 86.0 2019-05-03 11:07:28 2019-05-03 11:10:33 0.765658 0.477438
2 1 3 71.0 2019-05-03 11:11:44 2019-05-03 11:14:36 0.580313 0.430460
3 1 4 218.0 2019-05-03 11:18:14 2019-05-03 11:24:19 0.518496 0.379979
4 1 5 110.0 2019-05-03 11:26:09 2019-05-03 11:28:37 0.867133 0.643691
5 2 1 NaN 2019-05-03 11:34:14 2019-05-03 11:36:31 0.381124 0.220351
6 2 2 139.0 2019-05-03 11:38:50 2019-05-03 11:44:28 0.261665 0.205527
7 2 3 111.0 2019-05-03 11:46:19 2019-05-03 11:48:25 0.710297 0.484962

This section constructs a dynamic SQL query that uses window functions to compute a new feature, no_heat_seconds, by subtracting the previous cycle’s end time from the current cycle’s start time and assigns row numbers for proper ordering. After retrieving the data_arc table with these additional computed columns, the code checks for redundancy between the two row numbering columns and drops the duplicate before displaying a preview of the updated data.
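
As a cross-check of the SQL window functions, the same no_heat_seconds value can be reproduced in pandas by lagging end_heat within each key group and differencing it with begin_heat. This is only an illustrative sketch, not part of the pipeline (which computes the column in SQL as shown above); no_heat_seconds_pd is a hypothetical helper column used only for comparison.

# Illustrative cross-check: recompute no_heat_seconds with a pandas group-wise lag
arc_check = from_db.data_arc.copy()
arc_check["begin_heat"] = pd.to_datetime(arc_check["begin_heat"])
arc_check["end_heat"] = pd.to_datetime(arc_check["end_heat"])
arc_check = arc_check.sort_values(["key", "begin_heat"])

# LAG(end_heat) OVER (PARTITION BY key ORDER BY begin_heat) expressed in pandas
prev_end = arc_check.groupby("key")["end_heat"].shift(1)
arc_check["no_heat_seconds_pd"] = (arc_check["begin_heat"] - prev_end).dt.total_seconds()

# The first step of each batch stays NaN; all other rows should match the SQL column
print(arc_check[["key", "no_heat_seconds", "no_heat_seconds_pd"]].head())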

Gas¶

In [14]:
from_db.data_gas = select(from_="data_gas")
from_db.data_gas.columns = Gas.columns()
sql.print_statement()
from_db.data_gas.head()
SELECT * FROM data_gas;

Out[14]:
key gas_1
0 1 29.749986
1 2 12.555561
2 3 28.554793
3 4 18.841219
4 5 5.413692

This section retrieves the data_gas table from the database, renames its columns according to the Gas schema, prints the generated SQL query, and displays the first few rows of the resulting DataFrame.

Temperature¶

In [15]:
from_db.data_temp = select(
    *Temp.columns(alias=True, add=[Temp.MEAUSURE_TIME.row_number("step")]),
    from_="data_temp",
)
sql.print_statement_pretty()
from_db.data_temp.head(8)
SELECT
    key, 
    ROW_NUMBER() OVER (PARTITION BY key ORDER BY DATETIME("MeasureTime") ASC) AS step, 
    DATETIME("MeasureTime") AS meausure_time, 
    "Temperature" AS temperature
FROM data_temp;

Out[15]:
key step meausure_time temperature
0 1 1 2019-05-03 11:02:04 1571.0
1 1 2 2019-05-03 11:07:18 1604.0
2 1 3 2019-05-03 11:11:34 1618.0
3 1 4 2019-05-03 11:18:04 1601.0
4 1 5 2019-05-03 11:25:59 1606.0
5 1 6 2019-05-03 11:30:38 1613.0
6 2 1 2019-05-03 11:34:04 1581.0
7 2 2 2019-05-03 11:38:40 1577.0

This section retrieves temperature measurement data from the database, adds a sequential step column using a window function to order measurements by time for each batch, and then displays the first few rows of the resulting DataFrame.

Bulk¶

In [16]:
from_db.data_bulk = select(from_="data_bulk")
from_db.data_bulk.columns = from_db.data_bulk.columns.str.lower().str.replace(" ", "_")
sql.print_statement()
from_db.data_bulk.head()
SELECT * FROM data_bulk;

Out[16]:
key bulk_1 bulk_2 bulk_3 bulk_4 bulk_5 bulk_6 bulk_7 bulk_8 bulk_9 bulk_10 bulk_11 bulk_12 bulk_13 bulk_14 bulk_15
0 1 NaN NaN NaN 43.0 None None None None None None None 206.0 None 150.0 154.0
1 2 NaN NaN NaN 73.0 None None None None None None None 206.0 None 149.0 154.0
2 3 NaN NaN NaN 34.0 None None None None None None None 205.0 None 152.0 153.0
3 4 NaN NaN NaN 81.0 None None None None None None None 207.0 None 153.0 154.0
4 5 NaN NaN NaN 78.0 None None None None None None None 203.0 None 151.0 152.0
In [17]:
from_db.data_bulk_time = select(from_="data_bulk_time")
from_db.data_bulk_time.columns = (
    from_db.data_bulk_time.columns.str.lower().str.replace(" ", "_")
)
sql.print_statement()
display(from_db.data_bulk_time.T.head(9).T.head(3))
from_db.data_bulk_time.T.tail(7).T.head(3)
SELECT * FROM data_bulk_time;

key bulk_1 bulk_2 bulk_3 bulk_4 bulk_5 bulk_6 bulk_7 bulk_8
0 1 None None None 2019-05-03 11:28:48 None None None None
1 2 None None None 2019-05-03 11:36:50 None None None None
2 3 None None None 2019-05-03 12:32:39 None None None None
Out[17]:
bulk_9 bulk_10 bulk_11 bulk_12 bulk_13 bulk_14 bulk_15
0 None None None 2019-05-03 11:24:31 None 2019-05-03 11:14:50 2019-05-03 11:10:43
1 None None None 2019-05-03 11:53:30 None 2019-05-03 11:48:37 2019-05-03 11:44:39
2 None None None 2019-05-03 12:27:13 None 2019-05-03 12:21:01 2019-05-03 12:16:16

This section retrieves data from the data_bulk and data_bulk_time tables, standardizes their column names by converting them to lowercase and replacing spaces with underscores, and then displays previews of both the bulk quantities and their corresponding delivery timestamps.

Wire¶

In [18]:
from_db.data_wire = select(from_="data_wire")
from_db.data_wire.columns = from_db.data_wire.columns.str.lower().str.replace(" ", "_")
sql.print_statement()
from_db.data_wire.head()
SELECT * FROM data_wire;

Out[18]:
key wire_1 wire_2 wire_3 wire_4 wire_5 wire_6 wire_7 wire_8 wire_9
0 1 60.059998 None None None None None None None None
1 2 96.052315 None None None None None None None None
2 3 91.160157 None None None None None None None None
3 4 89.063515 None None None None None None None None
4 5 89.238236 9.11456 None None None None None None None
In [19]:
from_db.data_wire_time = select(from_="data_wire_time")
from_db.data_wire_time.columns = (
    from_db.data_wire_time.columns.str.lower().str.replace(" ", "_")
)
sql.print_statement()
from_db.data_wire_time.head()
SELECT * FROM data_wire_time;

Out[19]:
key wire_1 wire_2 wire_3 wire_4 wire_5 wire_6 wire_7 wire_8 wire_9
0 1 2019-05-03 11:06:19 None None None None None None None None
1 2 2019-05-03 11:36:50 None None None None None None None None
2 3 2019-05-03 12:11:46 None None None None None None None None
3 4 2019-05-03 12:43:22 None None None None None None None None
4 5 2019-05-03 13:20:44 2019-05-03 13:15:34 None None None None None None None

This section retrieves the data_wire and data_wire_time tables from the database, standardizes their column names by converting them to lowercase and replacing spaces with underscores, prints the executed SQL queries, and previews the first few rows of each table: the delivered wire material volumes and the timestamps of when those wire additions were made to the steel batches.

Data Frame Views¶

In [20]:
@dataclass
class XColumns:  # .              DATATYPE   DESCRIPTION
    base_int: list[str]
    base_float: list[str]
    base_str: list[str]
    arc: list[str]  # .           int        time in seconds - t{odd}
    no_arc: list[str]  # .        int        time in seconds - t{even}
    active: list[str]  # .        float      active power:   - p{odd}
    reactive: list[str]  # .      float      reactive power  - p{even} (negative values)
    bulk: list[str]  # .          float
    bulk_t: list[str]  # .        int        time in seconds
    wire: list[str]  # .          float
    wire_t: list[str]  # .        int        time in seconds        ___________________
    apparent: list[str]  # .      float      active|reactive - s = √ p{odd}² + p{even}²`
    power_x: list[str] # .        float      active|apparent - pf = p{odd} / s{i}
    sums: list[str]

    @staticmethod
    def display_subsets(df: pd.DataFrame, subsets: list[list[str]], head: int = 1):
        for subset in subsets:
            display(df[subset].head(head))


@dataclass
class Xy:
    cols: XColumns
    y_cols: list[str]
    synth: pd.DataFrame = None
    X_train: pd.DataFrame = None
    y_train: pd.DataFrame = None
    X_test: pd.DataFrame = None
    y_test: pd.DataFrame = None


def describe(dev: bool = DEV, **kwdf: pd.DataFrame) -> None:
    if dev:
        for for_col, to_describe in kwdf.items():
            display(to_describe[for_col].describe())
In [21]:
x_columns = XColumns(
    base_int=["tx", "t_delta", "t_delta_w"],
    base_float=["input_temp", "gas_1"],
    base_str=["bulks", "t_pos", "wires", "t_pos_w"],
    arc=[f"t{i}" for i in range(1, 32, 2)],
    no_arc=[f"t{i}" for i in range(2, 33, 2)],
    active=[f"p{i}" for i in range(1, 32, 2)],
    reactive=[f"p{i}" for i in range(2, 33, 2)],
    bulk=[f"bulk_{i}" for i in range(1, 16, 1)],
    bulk_t=[f"bulk_{i}t" for i in range(1, 16, 1)],
    wire=[f"wire_{i}" for i in range(1, 10, 1)],
    wire_t=[f"wire_{i}t" for i in range(1, 10, 1)],
    apparent=[f"s{i}" for i in range(1, 17, 1)],
    power_x=[f"pf{i}" for i in range(1, 17, 1)],
    sums=["arc_sum", "no_arc_sum", "apparent_sum", "power_x_sum"]
)
xy = Xy(cols=x_columns, y_cols=["target"])

Preprocessing and EDA (Step II)¶

In [22]:
@dataclass
class Plot:
    """src: https://gist.github.com/eugen-hoppe/400e47930f733229eaf773f070c81e62"""
    df: pd.DataFrame  # .                            Cache pd.Dataframe for plot 
    attribute: list[str]  # .                        [ {df-column-name}, {plot-label} ]
    color: tuple[str, str] = ("gray", "black")  # .  ( color, edgecolor, )
    figsize: tuple[int, int] = (10, 3)

    def histogram(self, bins: int = 16, ax: type_ax.Axes = None, **kwargs) -> None:
        if ax is None:
            _, ax = plt.subplots(figsize=self.figsize)
        ax.hist(
            self.df[self.attribute[0]],
            bins=bins,
            color=self.color[0],
            edgecolor=self.color[1],
            **kwargs
        )
        ax.set_xlabel(self.attribute[1])
        ax.set_ylabel("Frequency")
        ax.set_title(self.attribute[-1])
        ax.grid(visible=False)

    def boxplot(self, ax: type_ax.Axes = None, **kwargs) -> None:
        if ax is None:
            _, ax = plt.subplots(figsize=self.figsize)
        ax.boxplot(self.df[self.attribute[0]], vert=False, **kwargs)
        ax.set_title("Boxplot")
        ax.set_ylabel(self.attribute[0])

    def histogram_and_boxplot(self, bins: int = 16, bpkw: dict = {}, **kwargs) -> None:
        _, axes = plt.subplots(1, 2, figsize=self.figsize)
        self.histogram(bins, axes[0], **kwargs)
        self.boxplot(axes[1], **bpkw)
        plt.show()
In [23]:
@dataclass
class Aggregated(DataFramesFromDB):
    combined: pd.DataFrame = None
    bulk: pd.DataFrame = None
    wire: pd.DataFrame = None
In [24]:
aggr = Aggregated()
cache = dict()

Arc¶

In [25]:
from_db.data_arc["begin_heat"] = pd.to_datetime(from_db.data_arc["begin_heat"])
from_db.data_arc["end_heat"] = pd.to_datetime(from_db.data_arc["end_heat"])
from_db.data_arc["seconds"] = (
    (from_db.data_arc["end_heat"] - from_db.data_arc["begin_heat"]).dt.seconds
)
from_db.data_arc["tx"] = 1 
from_db.data_arc["max"] = from_db.data_arc.groupby("key")["tx"].transform("sum")
from_db.data_arc.info()
from_db.data_arc.head(5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14876 entries, 0 to 14875
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   key              14876 non-null  int64         
 1   step             14876 non-null  int64         
 2   no_heat_seconds  11662 non-null  float64       
 3   begin_heat       14876 non-null  datetime64[ns]
 4   end_heat         14876 non-null  datetime64[ns]
 5   active_power     14876 non-null  float64       
 6   reactive_power   14876 non-null  float64       
 7   seconds          14876 non-null  int32         
 8   tx               14876 non-null  int64         
 9   max              14876 non-null  int64         
dtypes: datetime64[ns](2), float64(3), int32(1), int64(4)
memory usage: 1.1 MB
Out[25]:
key step no_heat_seconds begin_heat end_heat active_power reactive_power seconds tx max
0 1 1 NaN 2019-05-03 11:02:14 2019-05-03 11:06:02 0.305130 0.211253 228 1 5
1 1 2 86.0 2019-05-03 11:07:28 2019-05-03 11:10:33 0.765658 0.477438 185 1 5
2 1 3 71.0 2019-05-03 11:11:44 2019-05-03 11:14:36 0.580313 0.430460 172 1 5
3 1 4 218.0 2019-05-03 11:18:14 2019-05-03 11:24:19 0.518496 0.379979 365 1 5
4 1 5 110.0 2019-05-03 11:26:09 2019-05-03 11:28:37 0.867133 0.643691 148 1 5
In [26]:
arc_max_value_counts = from_db.data_arc["max"].value_counts()
cache["data_arc_max_steps"] = (
    int(arc_max_value_counts.index.max())  # largest step count per batch, not its frequency
)
plot_arc = Plot(from_db.data_arc.copy(), attribute=["max", "Steps", "Maximal Steps"])
plot_arc.histogram(bins=cache["data_arc_max_steps"] - 1)
describe(max=from_db.data_arc)
arc_max_value_counts.to_frame().T
Out[26]:
max 5 4 6 3 7 8 2 9 10 11 1 12 15 13 16 14
count 3795 3568 2940 1560 1435 672 348 252 90 55 39 36 30 26 16 14
[Figure: Maximal Steps histogram (heating cycles per batch)]

This section processes the electrode heating data by converting the start and end times to datetime, calculating the duration of each heating cycle, and computing the total number of cycles per batch. It then performs exploratory analysis by displaying the data summary and visualizing the distribution of the maximum number of cycles.

Time (Arc)¶

Heat Time¶
In [27]:
plot_arc.attribute = ["seconds", "Seconds", "Heat Duration"]
describe(seconds=from_db.data_arc)
plot_arc.histogram_and_boxplot(30)
[Figure: Heat Duration histogram and boxplot]

The heat duration values are right-skewed, with a median of 147 seconds and an interquartile range from 100 to 210 seconds, while the mean of 171 seconds indicates that longer cycles are pulling the average upward. Overall, most heating cycles last around two to three minutes, but some extend significantly longer (up to 900 seconds), creating a long tail in the distribution.

Idle Time¶
In [28]:
plot_arc.attribute = ["no_heat_seconds", "Seconds", "Arc Idle Time"]
describe(no_heat_seconds=from_db.data_arc)
plot_arc.histogram(200, range=(0, 15_000))
[Figure: Arc Idle Time histogram]

The idle time distribution shows a median of about 201 seconds, indicating that most idle periods are relatively short. However, the maximum value of 14,528 seconds reveals a long right tail, suggesting the presence of some unusually long idle periods.

Power (Arc)¶

Active Power¶
In [29]:
from_db.data_arc["active_power"].describe().to_frame()
Out[29]:
active_power
count 14876.000000
mean 0.662752
std 0.258885
min 0.223120
25% 0.467115
50% 0.599587
75% 0.830070
max 1.463773
In [30]:
plot_arc.attribute = ["active_power", "value", "Active Power"]
plot_arc.histogram_and_boxplot(30)
[Figure: Active Power histogram and boxplot]

The active power values have a median of approximately 0.60 and a mean of about 0.66, with most readings falling between roughly 0.47 (25th percentile) and 0.83 (75th percentile), indicating moderate variability. The range from 0.22 to 1.46 and a standard deviation of 0.26 suggest that while most heating cycles operate within a consistent power band, there are occasional instances with higher active power values.

Reactive Power¶
In [31]:
from_db.data_arc["reactive_power"].describe().to_frame()
Out[31]:
reactive_power
count 14876.000000
mean 0.438986
std 5.873485
min -715.479924
25% 0.337175
50% 0.441639
75% 0.608201
max 1.270284
In [32]:
from_db.data_arc[from_db.data_arc["reactive_power"] <= 0]
Out[32]:
key step no_heat_seconds begin_heat end_heat active_power reactive_power seconds tx max
9780 2116 3 403.0 2019-07-28 02:22:08 2019-07-28 02:23:57 0.705344 -715.479924 109 1 4
In [33]:
from_db.data_arc = from_db.data_arc[from_db.data_arc["reactive_power"] > 0]  # drop
from_db.data_arc["reactive_power"].describe().to_frame()
Out[33]:
reactive_power
count 14875.000000
mean 0.487115
std 0.197612
min 0.153777
25% 0.337190
50% 0.441665
75% 0.608223
max 1.270284
In [34]:
plot_arc.df = from_db.data_arc.copy()
plot_arc.attribute = ["reactive_power", "value", "Reactive Power"]
describe(reactive_power=from_db.data_arc)
plot_arc.histogram_and_boxplot(30)
[Figure: Reactive Power histogram and boxplot]

This section examines the reactive power data by first summarizing it, revealing an extreme negative outlier, and then filtering out non-positive values to obtain a more representative distribution. Finally, it recalculates the descriptive statistics and visualizes the distribution using a histogram and boxplot.

Transformation¶

In [35]:
aggr.data_arc = from_db.data_arc.copy()
change_sign_for_even = -1  # Negative values for no_heat_seconds and reactive_power

transform_columns = [
    f"t{prod_step}" for prod_step in range(1, cache["data_arc_max_steps"] * 2 + 1)
]
power_columns = [
    f"p{prod_step}" for prod_step in range(1, cache["data_arc_max_steps"] * 2 + 1)
]
aggr.data_arc[transform_columns] = np.nan
aggr.data_arc[power_columns] = np.nan


for prod_step in range(1, cache["data_arc_max_steps"] + 1):
    t_odd_col, t_even_col = f"t{(prod_step * 2) - 1}", f"t{prod_step * 2 - 2}"
    p_odd_col, p_even_col = f"p{(prod_step * 2) - 1}", f"p{prod_step * 2}"
    step_value = (prod_step - 1) * 2  # Mapping step to correct row indexes
    step_q = (aggr.data_arc["step"] == prod_step)  # Select the row for the current step
    aggr.data_arc.loc[step_q, t_odd_col] = aggr.data_arc["seconds"]  # Fill `t{odd}`
    aggr.data_arc.loc[step_q, p_odd_col] = aggr.data_arc["active_power"]
    aggr.data_arc.loc[step_q, t_even_col] = (  # Fill `t{even}` 
        change_sign_for_even * aggr.data_arc["no_heat_seconds"]
    ) 
    aggr.data_arc.loc[step_q, p_even_col] = (
        change_sign_for_even * aggr.data_arc["reactive_power"]
    )


for prod_step in range(1, cache["data_arc_max_steps"] * 2 + 1):  # Fill NaN with zeros
    aggr.data_arc[f"t{prod_step}"].fillna(0, inplace=True)
    aggr.data_arc[f"t{prod_step}"] = aggr.data_arc[f"t{prod_step}"].astype(int)
    aggr.data_arc[f"p{prod_step}"].fillna(0.0, inplace=True)


aggr.data_arc.drop(columns=["t0"], inplace=True)  # temporary created by logic above
Event (Transformation)¶
In [36]:
from_col_1 = [
    "key", "step", "max", "begin_heat", "end_heat", "no_heat_seconds", "seconds"
]
q_transform_view_1 = [*from_col_1, *[f"t{t}" for t in range(1, 33)]]
display(aggr.data_arc[q_transform_view_1].T.head(12).T.head())
print(". . .")
aggr.data_arc[q_transform_view_1].T.tail(15).T.head()
key step max begin_heat end_heat no_heat_seconds seconds t1 t2 t3 t4 t5
0 1 1 5 2019-05-03 11:02:14 2019-05-03 11:06:02 NaN 228 228 0 0 0 0
1 1 2 5 2019-05-03 11:07:28 2019-05-03 11:10:33 86.0 185 0 -86 185 0 0
2 1 3 5 2019-05-03 11:11:44 2019-05-03 11:14:36 71.0 172 0 0 0 -71 172
3 1 4 5 2019-05-03 11:18:14 2019-05-03 11:24:19 218.0 365 0 0 0 0 0
4 1 5 5 2019-05-03 11:26:09 2019-05-03 11:28:37 110.0 148 0 0 0 0 0
. . .
Out[36]:
t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 t32
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Power (Transformation)¶
In [37]:
from_col_2 = ["key", "step", "max", "active_power", "reactive_power"]
q_transform_view_2 = [*from_col_2, *[f"p{p}" for p in range(1, 33)]]
display(aggr.data_arc[q_transform_view_2].T.head(10).T.head())
print(". . .")
aggr.data_arc[q_transform_view_2].T.tail(15).T.head()
key step max active_power reactive_power p1 p2 p3 p4 p5
0 1.0 1.0 5.0 0.305130 0.211253 0.30513 -0.211253 0.000000 0.000000 0.000000
1 1.0 2.0 5.0 0.765658 0.477438 0.00000 0.000000 0.765658 -0.477438 0.000000
2 1.0 3.0 5.0 0.580313 0.430460 0.00000 0.000000 0.000000 0.000000 0.580313
3 1.0 4.0 5.0 0.518496 0.379979 0.00000 0.000000 0.000000 0.000000 0.000000
4 1.0 5.0 5.0 0.867133 0.643691 0.00000 0.000000 0.000000 0.000000 0.000000
. . .
Out[37]:
p18 p19 p20 p21 p22 p23 p24 p25 p26 p27 p28 p29 p30 p31 p32
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

This section transforms the electrode heating data by creating new, cycle-specific features that capture both time and power information. For each production step, it generates paired columns where the odd-indexed time features record the heating duration and active power, while the even-indexed columns record the negative idle time and negative reactive power, fills missing values with zeros, and then displays subsets of these transformed features to verify the event and power transformations.

In our transformation, we invert the sign of the even-indexed features (idle time and reactive power) to emphasize their role as detractors from the overall energy efficiency, which often results in these features being highly correlated. One hypothesis is that a sigmoid activation function might benefit the neural network because it naturally handles negative inputs, preserving the contrast between the active (positive) and idle/loss (negative) states. Moreover, once derived features such as apparent power are computed (the squaring and square-root operations eliminate the negative sign), the sigmoid's ability to capture nuanced non-linearities has shown higher performance than tanh in preliminary optimization experiments, although further validation is needed.
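
To make the apparent-power and power-factor formulas from the XColumns annotations concrete (s = √(p_active² + p_reactive²) and pf = p_active / s), here is a minimal NumPy sketch using the power values of one heating step from the previews above; the actual s{i} and pf{i} features are presumably engineered later in the notebook, as suggested by the XColumns feature lists.

# Minimal sketch of the apparent power / power factor formulas (illustration only)
p_active = 0.765658      # p{odd}:  active power of one heating step
p_reactive = -0.477438   # p{even}: reactive power, stored with an inverted sign

s = np.sqrt(p_active ** 2 + p_reactive ** 2)  # squaring removes the negative sign
pf = p_active / s                             # power factor in (0, 1]

print(f"s={s:.4f}, pf={pf:.4f}")              # roughly 0.9023 and 0.8485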

Aggregation (Arc)¶

In [38]:
drop_columns_time = ["begin_heat", "end_heat", "no_heat_seconds", "seconds"]
drop_columns_power = ["active_power", "reactive_power"]
aggr.data_arc["dt"] = aggr.data_arc.groupby("key")["begin_heat"].transform("min")
aggr.data_arc.drop(
    columns=["step", "max", *drop_columns_time, *drop_columns_power], inplace=True
)
cache["=*x"] = "=" * 100
key_tx_dt = ["key", "tx", "dt"]
print("Table: 'data_arc' | Rows:", len(aggr.data_arc))
print(cache["=*x"] )
XColumns.display_subsets(
    aggr.data_arc,
    [key_tx_dt, x_columns.arc, x_columns.no_arc, x_columns.active, x_columns.reactive],
    head=2
)
print(cache["=*x"], "\n")
print("data_arc[ view @ arc ]:")
aggr.data_arc[x_columns.arc].head()
Table: 'data_arc' | Rows: 14875
====================================================================================================
key tx dt
0 1 1 2019-05-03 11:02:14
1 1 1 2019-05-03 11:02:14
t1 t3 t5 t7 t9 t11 t13 t15 t17 t19 t21 t23 t25 t27 t29 t31
0 228 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 185 0 0 0 0 0 0 0 0 0 0 0 0 0 0
t2 t4 t6 t8 t10 t12 t14 t16 t18 t20 t22 t24 t26 t28 t30 t32
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 -86 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
p1 p3 p5 p7 p9 p11 p13 p15 p17 p19 p21 p23 p25 p27 p29 p31
0 0.30513 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.00000 0.765658 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
p2 p4 p6 p8 p10 p12 p14 p16 p18 p20 p22 p24 p26 p28 p30 p32
0 -0.211253 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.000000 -0.477438 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
==================================================================================================== 

data_arc[ view @ arc ]:
Out[38]:
t1 t3 t5 t7 t9 t11 t13 t15 t17 t19 t21 t23 t25 t27 t29 t31
0 228 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 185 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 172 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 365 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 148 0 0 0 0 0 0 0 0 0 0 0
In [39]:
assert from_db.data_arc["begin_heat"].duplicated().sum() == 0  # origin step timestamps
aggr.data_arc = aggr.data_arc.groupby(["key","dt"]).sum()
aggr.data_arc.reset_index(inplace=True)
aggr.data_arc.set_index("dt", inplace=True)
aggr.data_arc.index.name = None

print("Table 'data_arc' | Rows (after aggegation):", len(aggr.data_arc))
print(cache["=*x"])
XColumns.display_subsets(
    aggr.data_arc,
    [
        ["key", "tx"],
        x_columns.arc,
        x_columns.no_arc,
        x_columns.active[:10],
        x_columns.active[10:],
        x_columns.reactive[:10],
        x_columns.reactive[10:],
    ],
    head=2
)
print(cache["=*x"], "\n")
print("data_arc[ view @ arc ] (after aggegation):")
aggr.data_arc[x_columns.arc].head()
Table 'data_arc' | Rows (after aggregation): 3214
====================================================================================================
key tx
2019-05-03 11:02:14 1 5
2019-05-03 11:34:14 2 4
t1 t3 t5 t7 t9 t11 t13 t15 t17 t19 t21 t23 t25 t27 t29 t31
2019-05-03 11:02:14 228 185 172 365 148 0 0 0 0 0 0 0 0 0 0 0
2019-05-03 11:34:14 137 338 126 210 0 0 0 0 0 0 0 0 0 0 0 0
t2 t4 t6 t8 t10 t12 t14 t16 t18 t20 t22 t24 t26 t28 t30 t32
2019-05-03 11:02:14 -86 -71 -218 -110 0 0 0 0 0 0 0 0 0 0 0 0
2019-05-03 11:34:14 -139 -111 -83 0 0 0 0 0 0 0 0 0 0 0 0 0
p1 p3 p5 p7 p9 p11 p13 p15 p17 p19
2019-05-03 11:02:14 0.305130 0.765658 0.580313 0.518496 0.867133 0.0 0.0 0.0 0.0 0.0
2019-05-03 11:34:14 0.381124 0.261665 0.710297 0.786322 0.000000 0.0 0.0 0.0 0.0 0.0
p21 p23 p25 p27 p29 p31
2019-05-03 11:02:14 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 11:34:14 0.0 0.0 0.0 0.0 0.0 0.0
p2 p4 p6 p8 p10 p12 p14 p16 p18 p20
2019-05-03 11:02:14 -0.211253 -0.477438 -0.430460 -0.379979 -0.643691 0.0 0.0 0.0 0.0 0.0
2019-05-03 11:34:14 -0.220351 -0.205527 -0.484962 -0.542517 0.000000 0.0 0.0 0.0 0.0 0.0
p22 p24 p26 p28 p30 p32
2019-05-03 11:02:14 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 11:34:14 0.0 0.0 0.0 0.0 0.0 0.0
==================================================================================================== 

data_arc[ view @ arc ] (after aggregation):
Out[39]:
t1 t3 t5 t7 t9 t11 t13 t15 t17 t19 t21 t23 t25 t27 t29 t31
2019-05-03 11:02:14 228 185 172 365 148 0 0 0 0 0 0 0 0 0 0 0
2019-05-03 11:34:14 137 338 126 210 0 0 0 0 0 0 0 0 0 0 0 0
2019-05-03 12:06:54 280 124 109 77 65 0 0 0 0 0 0 0 0 0 0 0
2019-05-03 12:39:37 207 99 157 278 0 0 0 0 0 0 0 0 0 0 0 0
2019-05-03 13:11:13 251 132 415 71 0 0 0 0 0 0 0 0 0 0 0 0

This section refines the transformed arc data by first removing redundant time and power columns and computing a new dt column that captures the earliest heating timestamp for each batch. The data is then grouped by the batch key and dt, summing all cycle-specific time and power features to consolidate multiple heating cycles into a single record per batch, with key subsets of the aggregated results displayed for verification.
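
Because every t{i} and p{i} column is non-zero in at most one step row per batch, summing within a batch simply merges the step rows into one wide record. A toy example (assumed data, not taken from the project tables) illustrates the idea:

# Toy illustration of why groupby(...).sum() collapses zero-padded step columns
toy = pd.DataFrame({
    "key": [1, 1, 2],
    "t1":  [228, 0, 137],   # non-zero in at most one row per key
    "t3":  [0, 185, 0],
})
print(toy.groupby("key").sum())
#       t1   t3
# key
# 1    228  185
# 2    137    0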

Temperature¶

In [40]:
from_db.data_temp["meausure_time"] = pd.to_datetime(from_db.data_temp["meausure_time"])
from_db.data_temp["temperature"] = from_db.data_temp["temperature"].astype(float)
from_db.data_temp = from_db.data_temp.dropna(subset=["temperature"])  # [^1]
from_db.data_temp.info()
<class 'pandas.core.frame.DataFrame'>
Index: 14665 entries, 0 to 18086
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   key            14665 non-null  int64         
 1   step           14665 non-null  int64         
 2   meausure_time  14665 non-null  datetime64[ns]
 3   temperature    14665 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 572.9 KB
In [41]:
from_db.data_temp["max"] = from_db.data_temp.groupby("key")["step"].transform("max")
from_db.data_temp.head()
Out[41]:
key step meausure_time temperature max
0 1 1 2019-05-03 11:02:04 1571.0 6
1 1 2 2019-05-03 11:07:18 1604.0 6
2 1 3 2019-05-03 11:11:34 1618.0 6
3 1 4 2019-05-03 11:18:04 1601.0 6
4 1 5 2019-05-03 11:25:59 1606.0 6
In [42]:
plot_temp = Plot(from_db.data_temp.copy(), attribute=["max", "Steps", "Maximal Steps"])
plot_temp.histogram()  # [^1]: Anomaly at 1 because of dropna temperature
describe(max=from_db.data_temp)
from_db.data_temp["max"].value_counts().sort_index().to_frame().T  # [^2]
Out[42]:
max 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17
count 741 64 408 1580 3440 3480 2716 1216 594 210 77 36 26 28 32 17
[Figure: Maximal Steps histogram (temperature measurements per batch)]
In [43]:
from_db.data_temp.info()

# Drop Anomalies
# --------------
from_db.data_temp = from_db.data_temp[from_db.data_temp["max"] > 1]  # [^2]
from_db.data_temp = from_db.data_temp[from_db.data_temp["temperature"] > 1500]  # [^3]

# [^2]: Drop because input_temp is a feature and cannot be the target if only 1 record exists
# [^3]: Temperature under 1500 is not possible

display(from_db.data_temp.tail())
from_db.data_temp["max"].value_counts().sort_index().to_frame().T
<class 'pandas.core.frame.DataFrame'>
Index: 14665 entries, 0 to 18086
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   key            14665 non-null  int64         
 1   step           14665 non-null  int64         
 2   meausure_time  14665 non-null  datetime64[ns]
 3   temperature    14665 non-null  float64       
 4   max            14665 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 687.4 KB
key step meausure_time temperature max
13921 2499 1 2019-08-10 13:33:21 1569.0 5
13922 2499 2 2019-08-10 13:41:34 1604.0 5
13923 2499 3 2019-08-10 13:46:28 1593.0 5
13924 2499 4 2019-08-10 13:54:56 1588.0 5
13925 2499 5 2019-08-10 13:58:58 1603.0 5
Out[43]:
max 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17
count 64 407 1579 3440 3479 2716 1215 594 210 77 36 26 28 32 17

This section preprocesses the temperature data by converting the measurement timestamps to datetime and casting the temperature values to float, while dropping any rows with missing temperature readings. It then calculates a new column, max, which represents the maximum number of measurements per batch, analyzes its distribution with descriptive statistics and histograms, and filters out batches with only one measurement or temperatures below 1500°C to remove anomalies and ensure data quality for further analysis.

Transformation (Temperature)¶

In [44]:
def transform_last(x_: pd.Series) -> float:
    return x_.iloc[-1] if pd.notna(x_.iloc[-1]) else -1.0


from_db.data_temp["input_temp"] = (
    from_db.data_temp.groupby("key")["temperature"].transform("first")
)
from_db.data_temp["target"] = (
    from_db.data_temp.groupby("key")["temperature"].transform(transform_last)
)
from_db.data_temp.head()
Out[44]:
key step meausure_time temperature max input_temp target
0 1 1 2019-05-03 11:02:04 1571.0 6 1571.0 1613.0
1 1 2 2019-05-03 11:07:18 1604.0 6 1571.0 1613.0
2 1 3 2019-05-03 11:11:34 1618.0 6 1571.0 1613.0
3 1 4 2019-05-03 11:18:04 1601.0 6 1571.0 1613.0
4 1 5 2019-05-03 11:25:59 1606.0 6 1571.0 1613.0
In [45]:
plot_temp.df = from_db.data_temp.copy()  # update
plot_temp.attribute = ["temperature", "Temperature"]
describe(temperature=from_db.data_temp)
plot_temp.histogram_and_boxplot(50)
[Figure: Temperature histogram and boxplot]

This section transforms the temperature data by creating two new features: one that captures the first temperature measurement for each batch as the input and another that records the last measurement as the target variable. It then updates the visualization settings and examines the temperature distribution through descriptive statistics and combined histogram-boxplot analysis, revealing a relatively narrow spread around a mean of approximately 1590°C.
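
The first/last logic can be illustrated on a small assumed example (the column names mirror the notebook, the values are made up): transform("first") broadcasts the first measurement of each batch, while the custom transform_last defined above broadcasts the last one.

# Toy illustration of the input_temp / target construction (assumed data)
toy_temp = pd.DataFrame({
    "key":         [1, 1, 1, 2, 2],
    "temperature": [1571.0, 1604.0, 1613.0, 1581.0, 1602.0],
})
toy_temp["input_temp"] = toy_temp.groupby("key")["temperature"].transform("first")
toy_temp["target"] = toy_temp.groupby("key")["temperature"].transform(transform_last)
print(toy_temp)  # key 1 -> input_temp 1571.0, target 1613.0; key 2 -> 1581.0, 1602.0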

Aggregation (Temperature)¶

In [46]:
aggr.data_temp = from_db.data_temp.copy()

temp_col = ["key", "max", "input_temp", "target"]
aggr.data_temp = aggr.data_temp.groupby("key")[temp_col].first().reset_index(drop=True)
aggr.data_temp.info()
aggr.data_temp.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2475 entries, 0 to 2474
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   key         2475 non-null   int64  
 1   max         2475 non-null   int64  
 2   input_temp  2475 non-null   float64
 3   target      2475 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 77.5 KB
Out[46]:
key max input_temp target
0 1 6 1571.0 1613.0
1 2 5 1581.0 1602.0
2 3 6 1596.0 1599.0
3 4 5 1601.0 1625.0
4 5 5 1576.0 1602.0
In [47]:
plot_temp.df = aggr.data_temp.copy()
plot_temp.attribute = ["input_temp", "Input Temperature", "Temperature (Feature)"]
describe(input_temp=from_db.data_temp)
plot_temp.histogram_and_boxplot(50)
[Figure: Input Temperature (feature) histogram and boxplot]

The histogram indicates that most initial (input) temperatures cluster around 1580–1600°C, while the boxplot on the right shows a relatively compact interquartile range with a handful of outliers below 1525°C and above 1650°C.

In [48]:
plot_temp.attribute = ["target", "Target Temperature", "Temperature (Target)"]
describe(target=from_db.data_temp)
plot_temp.histogram_and_boxplot(50)
[Figure: Target Temperature histogram and boxplot]

The histogram on the left indicates that most target temperature values cluster around 1590–1600°C, while the boxplot on the right confirms a relatively narrow spread with some outliers up to 1700°C.

This section aggregates the temperature data by batch, extracting a single record per batch that includes the first temperature measurement as the input and the last measurement as the target, along with the maximum number of measurements. The aggregated dataset is then visualized using histograms and boxplots to examine the distributions of both input and target temperatures.

Gas¶

In [49]:
aggr.data_gas = from_db.data_gas.copy()
aggr.data_gas.info()
aggr.data_gas.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3239 entries, 0 to 3238
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   key     3239 non-null   int64  
 1   gas_1   3239 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 50.7 KB
Out[49]:
key gas_1
0 1 29.749986
1 2 12.555561
2 3 28.554793
3 4 18.841219
4 5 5.413692
In [50]:
aggr.data_gas["gas_1"].describe().to_frame()
Out[50]:
gas_1
count 3239.000000
mean 11.002062
std 6.220327
min 0.008399
25% 7.043089
50% 9.836267
75% 13.769915
max 77.995040
In [51]:
plot_gas = Plot(aggr.data_gas.copy(), ["gas_1", "Gas"])
describe(gas_1=from_db.data_gas)
plot_gas.histogram_and_boxplot(30)
[Figure: Gas volume histogram and boxplot]

This section explores the gas purging data, revealing that most gas volumes range between about 7 and 14 units with an average of roughly 11, but occasionally extend up to nearly 78 units. The histogram and boxplot highlight a moderate right skew, indicating that while most batches use relatively modest amounts of gas, a few batches require substantially more.

Additions¶

In [52]:
def encode_combinations(row, prefix: str = "bulk_", stop: int = 16) -> str:
    non_null_bulk = [str(i) for i in range(1, stop) if pd.notna(row[f"{prefix}{i}"])]
    return "-".join(non_null_bulk)


def encode_positions(row: pd.Series, prefix: str = "bulk_", stop: int = 16) -> str:
    non_null_bulks = [i for i in range(1, stop) if pd.notna(row[f"{prefix}{i}"])]
    if not non_null_bulks:
        return ""
    bulk_to_time = {i: row[f"{prefix}{i}"] for i in non_null_bulks}
    sorted_desc = sorted(
        non_null_bulks, key=lambda b: (bulk_to_time[b], b), reverse=False
    )
    ranks_desc, rank_map, rank_counter = {}, {}, 1
    for bulk_index in sorted_desc:
        if bulk_to_time[bulk_index] not in rank_map:
            rank_map[bulk_to_time[bulk_index]] = rank_counter
            rank_counter += 1
        ranks_desc[bulk_index] = rank_map[bulk_to_time[bulk_index]]
    return "-".join(str(rank) for rank in [ranks_desc[i] for i in non_null_bulks])


def compute_time_delta(row: pd.Series, prefix: str = "bulk_", stop: int = 16):
    timestamps = [
        row[f"{prefix}{i}"] for i in range(1, stop) if pd.notna(row[f"{prefix}{i}"])
    ]
    if len(timestamps) < 2:
        return pd.Timedelta(0)
    return max(timestamps) - min(timestamps)
In [53]:
def check_combinations(df_1: pd.DataFrame, df_2: pd.DataFrame, prefix: str = "bulk"):
    df_ = df_1.merge(
        df_2[["key", f"{prefix}s"]],  # Only take key andf {prefix}s fromf {prefix}_time
        on="key", suffixes=(f"_{prefix}", f"_{prefix}_time")
    )
    df_[f"{prefix}s_match"] = (
        df_[f"{prefix}s_{prefix}"] == df_[f"{prefix}s_{prefix}_time"]
    )
    mismatch_count = (~df_[f"{prefix}s_match"]).sum()  # Count mismatches
    print(f"Total keys checked: {len(df_)}")
    print(f'Matching keys: {(df_[f"{prefix}s_match"]).sum()}')
    print(f"Mismatching keys: {mismatch_count}")

This set of functions creates additional descriptive features for bulk (or wire) additions by capturing which materials were added (encode_combinations), in what order they were introduced (encode_positions), and how much time elapsed between the first and last addition (compute_time_delta). The check_combinations function then compares these derived encodings across two related DataFrames, ensuring consistency between the bulk additions and their recorded times.
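
As a quick usage sketch, applying the helpers to a hand-built pandas Series containing the bulk timestamps of key 1 shown earlier reproduces the encodings that appear later in the aggregated table:

# Usage sketch of the helpers above on the bulk_time values of key 1
row = pd.Series({f"bulk_{i}": pd.NaT for i in range(1, 16)})
row["bulk_4"] = pd.Timestamp("2019-05-03 11:28:48")
row["bulk_12"] = pd.Timestamp("2019-05-03 11:24:31")
row["bulk_14"] = pd.Timestamp("2019-05-03 11:14:50")
row["bulk_15"] = pd.Timestamp("2019-05-03 11:10:43")

print(encode_combinations(row))  # '4-12-14-15' -> which materials were added
print(encode_positions(row))     # '4-3-2-1'    -> addition order (1 = earliest)
print(compute_time_delta(row))   # 0 days 00:18:05 (first to last addition)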

Bulk (Quantity)¶

In [54]:
aggr.data_bulk = from_db.data_bulk.copy()
bulk_columns = [f"bulk_{n}" for n in range(1, 16)]
aggr.data_bulk[bulk_columns] = aggr.data_bulk[bulk_columns].astype(float)
aggr.data_bulk.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3129 entries, 0 to 3128
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   key      3129 non-null   int64  
 1   bulk_1   252 non-null    float64
 2   bulk_2   22 non-null     float64
 3   bulk_3   1298 non-null   float64
 4   bulk_4   1014 non-null   float64
 5   bulk_5   77 non-null     float64
 6   bulk_6   576 non-null    float64
 7   bulk_7   25 non-null     float64
 8   bulk_8   1 non-null      float64
 9   bulk_9   19 non-null     float64
 10  bulk_10  176 non-null    float64
 11  bulk_11  177 non-null    float64
 12  bulk_12  2450 non-null   float64
 13  bulk_13  18 non-null     float64
 14  bulk_14  2806 non-null   float64
 15  bulk_15  2248 non-null   float64
dtypes: float64(15), int64(1)
memory usage: 391.2 KB
In [55]:
aggr.data_bulk["bulks"] = aggr.data_bulk.apply(encode_combinations, axis=1)
print("Unique bulk combinations:", len(aggr.data_bulk["bulks"].unique()))
display(aggr.data_bulk.T.head(9).T.head())
print(". . .")
aggr.data_bulk.T.tail(7).T.head()
Unique bulk combinations: 143
key bulk_1 bulk_2 bulk_3 bulk_4 bulk_5 bulk_6 bulk_7 bulk_8
0 1 NaN NaN NaN 43.0 NaN NaN NaN NaN
1 2 NaN NaN NaN 73.0 NaN NaN NaN NaN
2 3 NaN NaN NaN 34.0 NaN NaN NaN NaN
3 4 NaN NaN NaN 81.0 NaN NaN NaN NaN
4 5 NaN NaN NaN 78.0 NaN NaN NaN NaN
. . .
Out[55]:
bulk_10 bulk_11 bulk_12 bulk_13 bulk_14 bulk_15 bulks
0 NaN NaN 206.0 NaN 150.0 154.0 4-12-14-15
1 NaN NaN 206.0 NaN 149.0 154.0 4-12-14-15
2 NaN NaN 205.0 NaN 152.0 153.0 4-12-14-15
3 NaN NaN 207.0 NaN 153.0 154.0 4-12-14-15
4 NaN NaN 203.0 NaN 151.0 152.0 4-12-14-15

This section converts all bulk material columns to floating-point, generates a combined encoding of which materials were added (bulks), and provides an overview of the resulting data. Notably, it identifies 143 distinct combinations of bulk materials across all batches, demonstrating how many types of additions can occur.

Bulk (Time)¶

In [56]:
aggr.data_bulk_time = from_db.data_bulk_time.copy()

for col in bulk_columns:
    aggr.data_bulk_time[col] = pd.to_datetime(aggr.data_bulk_time[col], errors="coerce")

aggr.data_bulk_time.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3129 entries, 0 to 3128
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   key      3129 non-null   int64         
 1   bulk_1   252 non-null    datetime64[ns]
 2   bulk_2   22 non-null     datetime64[ns]
 3   bulk_3   1298 non-null   datetime64[ns]
 4   bulk_4   1014 non-null   datetime64[ns]
 5   bulk_5   77 non-null     datetime64[ns]
 6   bulk_6   576 non-null    datetime64[ns]
 7   bulk_7   25 non-null     datetime64[ns]
 8   bulk_8   1 non-null      datetime64[ns]
 9   bulk_9   19 non-null     datetime64[ns]
 10  bulk_10  176 non-null    datetime64[ns]
 11  bulk_11  177 non-null    datetime64[ns]
 12  bulk_12  2450 non-null   datetime64[ns]
 13  bulk_13  18 non-null     datetime64[ns]
 14  bulk_14  2806 non-null   datetime64[ns]
 15  bulk_15  2248 non-null   datetime64[ns]
dtypes: datetime64[ns](15), int64(1)
memory usage: 391.2 KB
In [57]:
display(aggr.data_bulk_time.T.head(9).T.head())
print(". . .")
aggr.data_bulk_time.T.tail(7).T.head()
key bulk_1 bulk_2 bulk_3 bulk_4 bulk_5 bulk_6 bulk_7 bulk_8
0 1 NaT NaT NaT 2019-05-03 11:28:48 NaT NaT NaT NaT
1 2 NaT NaT NaT 2019-05-03 11:36:50 NaT NaT NaT NaT
2 3 NaT NaT NaT 2019-05-03 12:32:39 NaT NaT NaT NaT
3 4 NaT NaT NaT 2019-05-03 12:43:22 NaT NaT NaT NaT
4 5 NaT NaT NaT 2019-05-03 13:30:47 NaT NaT NaT NaT
. . .
Out[57]:
bulk_9 bulk_10 bulk_11 bulk_12 bulk_13 bulk_14 bulk_15
0 NaT NaT NaT 2019-05-03 11:24:31 NaT 2019-05-03 11:14:50 2019-05-03 11:10:43
1 NaT NaT NaT 2019-05-03 11:53:30 NaT 2019-05-03 11:48:37 2019-05-03 11:44:39
2 NaT NaT NaT 2019-05-03 12:27:13 NaT 2019-05-03 12:21:01 2019-05-03 12:16:16
3 NaT NaT NaT 2019-05-03 12:58:00 NaT 2019-05-03 12:51:11 2019-05-03 12:46:36
4 NaT NaT NaT 2019-05-03 13:30:47 NaT 2019-05-03 13:34:12 2019-05-03 13:30:47

This section converts all bulk time columns to datetime objects and displays a preview of the resulting data to verify that the timestamps for bulk material additions are correctly parsed.

Bulk (Feature Engineering)¶

In [58]:
aggr.data_bulk_time["bulks"] = aggr.data_bulk_time.apply(encode_combinations, axis=1)
aggr.data_bulk_time["t_pos"] = aggr.data_bulk_time.apply(encode_positions, axis=1)
aggr.data_bulk_time["t_delta"] = aggr.data_bulk_time.apply(compute_time_delta, axis=1)
print("Unique bulk (time) combinations:", len(aggr.data_bulk_time["bulks"].unique()))
print(
    "Unique bulk sequence sub-combinations:",
    len((aggr.data_bulk_time["bulks"] + aggr.data_bulk_time["t_pos"]).unique()),
)
XColumns.display_subsets(
    aggr.data_bulk_time,
    subsets=[
        ["key", "bulks", "t_pos", "t_delta"], x_columns.bulk[:10], x_columns.bulk[10:]
    ],
    head=5,
)
Unique bulk (time) combinations: 143
Unique bulk sequence sub-combinations: 604
key bulks t_pos t_delta
0 1 4-12-14-15 4-3-2-1 0 days 00:18:05
1 2 4-12-14-15 1-4-3-2 0 days 00:16:40
2 3 4-12-14-15 4-3-2-1 0 days 00:16:23
3 4 4-12-14-15 1-4-3-2 0 days 00:14:38
4 5 4-12-14-15 1-1-2-1 0 days 00:03:25
bulk_1 bulk_2 bulk_3 bulk_4 bulk_5 bulk_6 bulk_7 bulk_8 bulk_9 bulk_10
0 NaT NaT NaT 2019-05-03 11:28:48 NaT NaT NaT NaT NaT NaT
1 NaT NaT NaT 2019-05-03 11:36:50 NaT NaT NaT NaT NaT NaT
2 NaT NaT NaT 2019-05-03 12:32:39 NaT NaT NaT NaT NaT NaT
3 NaT NaT NaT 2019-05-03 12:43:22 NaT NaT NaT NaT NaT NaT
4 NaT NaT NaT 2019-05-03 13:30:47 NaT NaT NaT NaT NaT NaT
bulk_11 bulk_12 bulk_13 bulk_14 bulk_15
0 NaT 2019-05-03 11:24:31 NaT 2019-05-03 11:14:50 2019-05-03 11:10:43
1 NaT 2019-05-03 11:53:30 NaT 2019-05-03 11:48:37 2019-05-03 11:44:39
2 NaT 2019-05-03 12:27:13 NaT 2019-05-03 12:21:01 2019-05-03 12:16:16
3 NaT 2019-05-03 12:58:00 NaT 2019-05-03 12:51:11 2019-05-03 12:46:36
4 NaT 2019-05-03 13:30:47 NaT 2019-05-03 13:34:12 2019-05-03 13:30:47

This step enriches the bulk data by creating additional columns that encode which materials were added (bulks), the order in which they were introduced (t_pos), and the overall time elapsed between the first and last addition (t_delta). It also reveals how many unique combinations and sequences of bulk additions are present, then displays example entries to confirm the new features.

Bulk (Merging)¶

In [59]:
check_combinations(aggr.data_bulk, aggr.data_bulk_time)
Total keys checked: 3129
Matching keys: 3129
Mismatching keys: 0
In [60]:
aggr.bulk = aggr.data_bulk.merge(
    aggr.data_bulk_time.drop(columns=["bulks"]),  # Mismatching keys: 0 -> drop
    on="key",
    suffixes=("", "t")  # To differentiate columns from both dataframes
)
XColumns.display_subsets(
    aggr.bulk,
    subsets=[
        ["key", "bulks", "t_pos", "t_delta"],
        x_columns.bulk[:10],
        x_columns.bulk[10:],
        x_columns.bulk_t[:10],
        x_columns.bulk_t[10:],
    ],
    head=3,
)
key bulks t_pos t_delta
0 1 4-12-14-15 4-3-2-1 0 days 00:18:05
1 2 4-12-14-15 1-4-3-2 0 days 00:16:40
2 3 4-12-14-15 4-3-2-1 0 days 00:16:23
bulk_1 bulk_2 bulk_3 bulk_4 bulk_5 bulk_6 bulk_7 bulk_8 bulk_9 bulk_10
0 NaN NaN NaN 43.0 NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN 73.0 NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN 34.0 NaN NaN NaN NaN NaN NaN
bulk_11 bulk_12 bulk_13 bulk_14 bulk_15
0 NaN 206.0 NaN 150.0 154.0
1 NaN 206.0 NaN 149.0 154.0
2 NaN 205.0 NaN 152.0 153.0
bulk_1t bulk_2t bulk_3t bulk_4t bulk_5t bulk_6t bulk_7t bulk_8t bulk_9t bulk_10t
0 NaT NaT NaT 2019-05-03 11:28:48 NaT NaT NaT NaT NaT NaT
1 NaT NaT NaT 2019-05-03 11:36:50 NaT NaT NaT NaT NaT NaT
2 NaT NaT NaT 2019-05-03 12:32:39 NaT NaT NaT NaT NaT NaT
bulk_11t bulk_12t bulk_13t bulk_14t bulk_15t
0 NaT 2019-05-03 11:24:31 NaT 2019-05-03 11:14:50 2019-05-03 11:10:43
1 NaT 2019-05-03 11:53:30 NaT 2019-05-03 11:48:37 2019-05-03 11:44:39
2 NaT 2019-05-03 12:27:13 NaT 2019-05-03 12:21:01 2019-05-03 12:16:16
In [61]:
plot_bulk = Plot(
    aggr.bulk.copy(), attribute=["t_delta", "Seconds", "Time Delta (Bulk)"]
)
plot_bulk.df["t_delta"] = plot_bulk.df["t_delta"].dt.total_seconds()
describe(t_delta=plot_bulk.df)
plot_bulk.histogram(100, range=(1, 15_000))  # Distribution like Arc Idle
[Figure: histogram of the bulk-addition time delta in seconds, range 1–15,000]

This section merges the quantity and timing data for bulk additions: it first checks that every key matches across the two tables, then combines them into a single DataFrame containing both volumes and timestamps. It concludes by examining the distribution of the total time between the first and last bulk addition (t_delta), which averages around 965 seconds but can stretch to over three and a half hours in some batches.

These long durations for bulk additions mirror the extended idle times observed in the arc data, reinforcing that extreme intervals occur consistently across multiple tables. It makes sense that prolonged idle periods and addition times overlap, because they’re part of the same steel-processing timeline—if the furnace is idle for longer, there’s also more time for materials to be added.
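
The check_combinations helper used above (and again for the wire tables below) is also defined earlier in the notebook. As a rough, assumed sketch of the kind of validation it performs, it can be thought of as comparing, key by key, the combination string derived from the quantity table with the one derived from the time table:

import pandas as pd


def check_combinations(
    df_qty: pd.DataFrame, df_time: pd.DataFrame, prefix: str = "bulk"
) -> None:
    # Compare each key's material combination across the two tables
    combo_col = f"{prefix}s"  # "bulks" or "wires"
    merged = df_qty[["key", combo_col]].merge(
        df_time[["key", combo_col]], on="key", suffixes=("_qty", "_time")
    )
    matching = int((merged[f"{combo_col}_qty"] == merged[f"{combo_col}_time"]).sum())
    print("Total keys checked:", len(merged))
    print("Matching keys:", matching)
    print("Mismatching keys:", len(merged) - matching)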

Wire (Quantity)¶

In [62]:
aggr.data_wire = from_db.data_wire.copy()
wire_columns = [f"wire_{n}" for n in range(1, 10)]
aggr.data_wire[wire_columns] = aggr.data_wire[wire_columns].astype(float)
aggr.data_wire.info()
aggr.data_wire.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3081 entries, 0 to 3080
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   key     3081 non-null   int64  
 1   wire_1  3055 non-null   float64
 2   wire_2  1079 non-null   float64
 3   wire_3  63 non-null     float64
 4   wire_4  14 non-null     float64
 5   wire_5  1 non-null      float64
 6   wire_6  73 non-null     float64
 7   wire_7  11 non-null     float64
 8   wire_8  19 non-null     float64
 9   wire_9  29 non-null     float64
dtypes: float64(9), int64(1)
memory usage: 240.8 KB
Out[62]:
key wire_1 wire_2 wire_3 wire_4 wire_5 wire_6 wire_7 wire_8 wire_9
0 1 60.059998 NaN NaN NaN NaN NaN NaN NaN NaN
1 2 96.052315 NaN NaN NaN NaN NaN NaN NaN NaN
2 3 91.160157 NaN NaN NaN NaN NaN NaN NaN NaN
3 4 89.063515 NaN NaN NaN NaN NaN NaN NaN NaN
4 5 89.238236 9.11456 NaN NaN NaN NaN NaN NaN NaN
In [63]:
aggr.data_wire["wires"] = aggr.data_wire.apply(
    encode_combinations, prefix="wire_", stop=10, axis=1
)
print("Unique wire combinations:", len(aggr.data_wire["wires"].unique()))
aggr.data_wire.head()
Unique wire combinations: 24
Out[63]:
key wire_1 wire_2 wire_3 wire_4 wire_5 wire_6 wire_7 wire_8 wire_9 wires
0 1 60.059998 NaN NaN NaN NaN NaN NaN NaN NaN 1
1 2 96.052315 NaN NaN NaN NaN NaN NaN NaN NaN 1
2 3 91.160157 NaN NaN NaN NaN NaN NaN NaN NaN 1
3 4 89.063515 NaN NaN NaN NaN NaN NaN NaN NaN 1
4 5 89.238236 9.11456 NaN NaN NaN NaN NaN NaN NaN 1-2

In this section, the wire material data is converted to floating-point values and combined into a new categorical feature (wires) that encodes which types of wire materials were added per batch. The DataFrame’s summary confirms only partial coverage for most wire materials, and 24 unique wire combinations are observed across the batches.

Wire (Time)¶

In [64]:
aggr.data_wire_time = from_db.data_wire_time.copy()

for col in wire_columns:
    aggr.data_wire_time[col] = pd.to_datetime(aggr.data_wire_time[col], errors="coerce")

aggr.data_wire_time.info()
aggr.data_wire_time.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3081 entries, 0 to 3080
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   key     3081 non-null   int64         
 1   wire_1  3055 non-null   datetime64[ns]
 2   wire_2  1079 non-null   datetime64[ns]
 3   wire_3  63 non-null     datetime64[ns]
 4   wire_4  14 non-null     datetime64[ns]
 5   wire_5  1 non-null      datetime64[ns]
 6   wire_6  73 non-null     datetime64[ns]
 7   wire_7  11 non-null     datetime64[ns]
 8   wire_8  19 non-null     datetime64[ns]
 9   wire_9  29 non-null     datetime64[ns]
dtypes: datetime64[ns](9), int64(1)
memory usage: 240.8 KB
Out[64]:
key wire_1 wire_2 wire_3 wire_4 wire_5 wire_6 wire_7 wire_8 wire_9
0 1 2019-05-03 11:06:19 NaT NaT NaT NaT NaT NaT NaT NaT
1 2 2019-05-03 11:36:50 NaT NaT NaT NaT NaT NaT NaT NaT
2 3 2019-05-03 12:11:46 NaT NaT NaT NaT NaT NaT NaT NaT
3 4 2019-05-03 12:43:22 NaT NaT NaT NaT NaT NaT NaT NaT
4 5 2019-05-03 13:20:44 2019-05-03 13:15:34 NaT NaT NaT NaT NaT NaT NaT

This step converts all wire time columns to datetime objects, ensuring that each wire addition timestamp is properly parsed and can be analyzed chronologically. The first few rows illustrate how many wires were added for each batch and precisely when they were introduced.

Wire (Feature Engineering)¶

In [65]:
wire_kw = {"axis": 1, "prefix": "wire_", "stop": 10}
aggr.data_wire_time["wires"] = aggr.data_wire_time.apply(encode_combinations, **wire_kw)
aggr.data_wire_time["t_pos_w"] = aggr.data_wire_time.apply(encode_positions, **wire_kw)
aggr.data_wire_time["t_delta_w"] = aggr.data_wire_time.apply(
    compute_time_delta, **wire_kw
)
print("Unique wire (time) combinations:", len(aggr.data_wire_time["wires"].unique()))
print(
    "Unique wire sequence sub-combinations:",
    len((aggr.data_wire_time["wires"] + aggr.data_wire_time["t_pos_w"]).unique())
)
XColumns.display_subsets(
    aggr.data_wire_time,
    subsets=[["key", "wires", "t_pos_w", "t_delta_w"], x_columns.wire],
    head=5,
)
Unique wire (time) combinations: 24
Unique wire sequence sub-combinations: 30
key wires t_pos_w t_delta_w
0 1 1 1 0 days 00:00:00
1 2 1 1 0 days 00:00:00
2 3 1 1 0 days 00:00:00
3 4 1 1 0 days 00:00:00
4 5 1-2 2-1 0 days 00:05:10
wire_1 wire_2 wire_3 wire_4 wire_5 wire_6 wire_7 wire_8 wire_9
0 2019-05-03 11:06:19 NaT NaT NaT NaT NaT NaT NaT NaT
1 2019-05-03 11:36:50 NaT NaT NaT NaT NaT NaT NaT NaT
2 2019-05-03 12:11:46 NaT NaT NaT NaT NaT NaT NaT NaT
3 2019-05-03 12:43:22 NaT NaT NaT NaT NaT NaT NaT NaT
4 2019-05-03 13:20:44 2019-05-03 13:15:34 NaT NaT NaT NaT NaT NaT NaT

This section enriches the wire data by encoding which wire types were added (wires), the sequence in which they were introduced (t_pos_w), and how long the additions took (t_delta_w). It then reveals the number of unique time-based wire combinations and displays sample rows to confirm these newly generated features.

Wire (Merging)¶

In [66]:
check_combinations(aggr.data_wire, aggr.data_wire_time, prefix="wire")
Total keys checked: 3081
Matching keys: 3081
Mismatching keys: 0
In [67]:
aggr.wire = aggr.data_wire.merge(
    aggr.data_wire_time.drop(columns=["wires"]),  # Mismatching keys: 0 -> drop
    on="key",
    suffixes=("", "t")  # To differentiate columns from both dataframes
)
XColumns.display_subsets(
    aggr.wire,
    subsets=[
        ["key", "wires", "t_pos_w", "t_delta_w"], x_columns.wire, x_columns.wire_t
    ],
    head=3,
)
key wires t_pos_w t_delta_w
0 1 1 1 0 days
1 2 1 1 0 days
2 3 1 1 0 days
wire_1 wire_2 wire_3 wire_4 wire_5 wire_6 wire_7 wire_8 wire_9
0 60.059998 NaN NaN NaN NaN NaN NaN NaN NaN
1 96.052315 NaN NaN NaN NaN NaN NaN NaN NaN
2 91.160157 NaN NaN NaN NaN NaN NaN NaN NaN
wire_1t wire_2t wire_3t wire_4t wire_5t wire_6t wire_7t wire_8t wire_9t
0 2019-05-03 11:06:19 NaT NaT NaT NaT NaT NaT NaT NaT
1 2019-05-03 11:36:50 NaT NaT NaT NaT NaT NaT NaT NaT
2 2019-05-03 12:11:46 NaT NaT NaT NaT NaT NaT NaT NaT
In [68]:
plot_wire = Plot(
    aggr.wire.copy(), attribute=["t_delta_w", "seconds", "Time Delta (Wires)"]
)
plot_wire.df["t_delta_w"] = plot_wire.df["t_delta_w"].dt.total_seconds()
describe(t_delta_w=plot_wire.df)
plot_wire.histogram(100, range=(1, 7_000))
[Figure: histogram of the wire-addition time delta in seconds, range 1–7,000]

This section validates that wire quantity and timing records match across both DataFrames, merges them into one table containing the wire volumes and corresponding timestamps, and then examines the distribution of the total wire addition time, which typically centers around a few minutes but occasionally extends to nearly 6,000 seconds.

Data Merging¶

In [69]:
# Merge Arc with Temperature
# ==========================
aggr.combined = aggr.data_arc.reset_index().merge(
    aggr.data_temp.drop(columns=["max"]), on="key", how="inner"
)
aggr.combined.set_index("index", inplace=True)
aggr.combined.index.name = None
aggr.combined.head()


# Merge Gas
# =========
aggr.combined = aggr.combined.reset_index().merge(aggr.data_gas, on="key", how="inner")
aggr.combined.set_index("index", inplace=True)
aggr.combined.index.name = None


#  Merge Bulk
# ==========
aggr.combined = aggr.combined.reset_index().merge(aggr.bulk, on="key", how="inner")
aggr.combined.set_index("index", inplace=True)
aggr.combined.index.name = None


# Merge Wire
# ==========
aggr.combined = aggr.combined.reset_index().merge(aggr.wire, on="key", how="inner")
aggr.combined.set_index("index", inplace=True)
aggr.combined.index.name = None


print("\n", "TABLE DIMENSIONS:", len(aggr.combined), "x", len(aggr.combined.T))
print("\n", "COLUMNS")
chunk_size = 10
for i in range(0, len(aggr.combined.columns), chunk_size):
    print('  |_ ' + ', '.join(aggr.combined.columns[i : i + chunk_size]) + ',')
 TABLE DIMENSIONS: 2329 x 123

 COLUMNS
  |_ key, tx, t1, t2, t3, t4, t5, t6, t7, t8,
  |_ t9, t10, t11, t12, t13, t14, t15, t16, t17, t18,
  |_ t19, t20, t21, t22, t23, t24, t25, t26, t27, t28,
  |_ t29, t30, t31, t32, p1, p2, p3, p4, p5, p6,
  |_ p7, p8, p9, p10, p11, p12, p13, p14, p15, p16,
  |_ p17, p18, p19, p20, p21, p22, p23, p24, p25, p26,
  |_ p27, p28, p29, p30, p31, p32, input_temp, target, gas_1, bulk_1,
  |_ bulk_2, bulk_3, bulk_4, bulk_5, bulk_6, bulk_7, bulk_8, bulk_9, bulk_10, bulk_11,
  |_ bulk_12, bulk_13, bulk_14, bulk_15, bulks, bulk_1t, bulk_2t, bulk_3t, bulk_4t, bulk_5t,
  |_ bulk_6t, bulk_7t, bulk_8t, bulk_9t, bulk_10t, bulk_11t, bulk_12t, bulk_13t, bulk_14t, bulk_15t,
  |_ t_pos, t_delta, wire_1, wire_2, wire_3, wire_4, wire_5, wire_6, wire_7, wire_8,
  |_ wire_9, wires, wire_1t, wire_2t, wire_3t, wire_4t, wire_5t, wire_6t, wire_7t, wire_8t,
  |_ wire_9t, t_pos_w, t_delta_w,

In this sequence, the arc, temperature, gas, bulk, and wire tables are merged into a single comprehensive DataFrame using inner joins on the batch key. The final DataFrame contains the time and power features, temperature measurements, gas volumes, bulk material quantities and timestamps, and wire material quantities and timestamps for each batch.

Datatype Processing (Additions)¶

Bulk (Datatypes)¶
In [70]:
bulk_t_columns = [f"bulk_{n}t" for n in range(1, 16)]

for bulk_t_col in bulk_t_columns:
    aggr.combined[bulk_t_col] = (
        (aggr.combined[bulk_t_col] - aggr.combined.index).dt.total_seconds()
    )
    aggr.combined[bulk_t_col].fillna(0, inplace=True)                       # bulk_{x}t
    aggr.combined[bulk_t_col] = aggr.combined[bulk_t_col].astype(int)
    aggr.combined[bulk_t_col.removesuffix("t")].fillna(0.0, inplace=True)   # bulk_{x}

aggr.combined[bulk_t_columns].info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2329 entries, 2019-05-03 11:02:14 to 2019-08-10 13:33:31
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   bulk_1t   2329 non-null   int64
 1   bulk_2t   2329 non-null   int64
 2   bulk_3t   2329 non-null   int64
 3   bulk_4t   2329 non-null   int64
 4   bulk_5t   2329 non-null   int64
 5   bulk_6t   2329 non-null   int64
 6   bulk_7t   2329 non-null   int64
 7   bulk_8t   2329 non-null   int64
 8   bulk_9t   2329 non-null   int64
 9   bulk_10t  2329 non-null   int64
 10  bulk_11t  2329 non-null   int64
 11  bulk_12t  2329 non-null   int64
 12  bulk_13t  2329 non-null   int64
 13  bulk_14t  2329 non-null   int64
 14  bulk_15t  2329 non-null   int64
dtypes: int64(15)
memory usage: 291.1 KB
In [71]:
XColumns.display_subsets(
    aggr.combined,
    subsets=[
        x_columns.bulk[:10],
        x_columns.bulk[10:],
        x_columns.bulk_t[:10],
        x_columns.bulk_t[10:]],
    head=3,
)
bulk_1 bulk_2 bulk_3 bulk_4 bulk_5 bulk_6 bulk_7 bulk_8 bulk_9 bulk_10
2019-05-03 11:02:14 0.0 0.0 0.0 43.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 11:34:14 0.0 0.0 0.0 73.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 12:06:54 0.0 0.0 0.0 34.0 0.0 0.0 0.0 0.0 0.0 0.0
bulk_11 bulk_12 bulk_13 bulk_14 bulk_15
2019-05-03 11:02:14 0.0 206.0 0.0 150.0 154.0
2019-05-03 11:34:14 0.0 206.0 0.0 149.0 154.0
2019-05-03 12:06:54 0.0 205.0 0.0 152.0 153.0
bulk_1t bulk_2t bulk_3t bulk_4t bulk_5t bulk_6t bulk_7t bulk_8t bulk_9t bulk_10t
2019-05-03 11:02:14 0 0 0 1594 0 0 0 0 0 0
2019-05-03 11:34:14 0 0 0 156 0 0 0 0 0 0
2019-05-03 12:06:54 0 0 0 1545 0 0 0 0 0 0
bulk_11t bulk_12t bulk_13t bulk_14t bulk_15t
2019-05-03 11:02:14 0 1337 0 756 509
2019-05-03 11:34:14 0 1156 0 863 625
2019-05-03 12:06:54 0 1219 0 847 562
Wire (Datatypes)¶
In [72]:
wire_t_columns = [f"wire_{n}t" for n in range(1, 10)]

for wire_t_col in wire_t_columns:
    aggr.combined[wire_t_col] = (
        (aggr.combined[wire_t_col] - aggr.combined.index).dt.total_seconds()
    )
    aggr.combined[wire_t_col].fillna(0, inplace=True)                       # wire_{x}t
    aggr.combined[wire_t_col] = aggr.combined[wire_t_col].astype(int)
    aggr.combined[wire_t_col.removesuffix("t")].fillna(0.0, inplace=True)   # wire_{x}

aggr.combined[wire_t_columns].info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2329 entries, 2019-05-03 11:02:14 to 2019-08-10 13:33:31
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   wire_1t  2329 non-null   int64
 1   wire_2t  2329 non-null   int64
 2   wire_3t  2329 non-null   int64
 3   wire_4t  2329 non-null   int64
 4   wire_5t  2329 non-null   int64
 5   wire_6t  2329 non-null   int64
 6   wire_7t  2329 non-null   int64
 7   wire_8t  2329 non-null   int64
 8   wire_9t  2329 non-null   int64
dtypes: int64(9)
memory usage: 182.0 KB
Datatypes (Additions)¶
In [73]:
for index, row in aggr.combined[[*bulk_t_columns, *wire_t_columns]].iterrows():
    for col in [*bulk_t_columns, *wire_t_columns]:
        if row[col] < 0.0:
            raise ValueError("Additions before first timestamp")

aggr.combined["t_delta"] = aggr.combined["t_delta"].dt.total_seconds().astype(int)
aggr.combined["t_delta_w"] = aggr.combined["t_delta_w"].dt.total_seconds().astype(int)
display(aggr.combined.head().T.head(10).T)
print(". . .")
aggr.combined.head().T.tail(10).T
key tx t1 t2 t3 t4 t5 t6 t7 t8
2019-05-03 11:02:14 1 5 228 -86 185 -71 172 -218 365 -110
2019-05-03 11:34:14 2 4 137 -139 338 -111 126 -83 210 0
2019-05-03 12:06:54 3 5 280 -138 124 -180 109 -298 77 -254
2019-05-03 12:39:37 4 4 207 -103 99 -108 157 -141 278 0
2019-05-03 13:11:13 5 4 251 -177 132 -179 415 -137 71 0
. . .
Out[73]:
wire_2t wire_3t wire_4t wire_5t wire_6t wire_7t wire_8t wire_9t t_pos_w t_delta_w
2019-05-03 11:02:14 0 0 0 0 0 0 0 0 1 0
2019-05-03 11:34:14 0 0 0 0 0 0 0 0 1 0
2019-05-03 12:06:54 0 0 0 0 0 0 0 0 1 0
2019-05-03 12:39:37 0 0 0 0 0 0 0 0 1 0
2019-05-03 13:11:13 261 0 0 0 0 0 0 0 2-1 310

In this stage, each bulk and wire timestamp is converted into a numerical “seconds” representation relative to the DataFrame index (the batch start time); missing timestamps and the corresponding missing quantities are filled with zero, and a sanity check confirms that no addition occurred before the first timestamp. As a result, all time-related addition columns (including t_delta and t_delta_w) become integers, facilitating straightforward numerical analysis later on.

Correlation Analysis¶

In [74]:
def correlation_mtx(df_: pd.DataFrame, *columns: list[str], title: str = "") -> None:
    # src: https://gist.github.com/eugen-hoppe/6009fc3bc061592dcd338d83371ec2a2
    phik_data = pd.get_dummies(
        df_[[*columns]],
        drop_first=True,
    )
    phik_matrix = phik_data[phik_data.columns].phik_matrix(
        interval_cols=phik_data.columns
    )
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        phik_matrix, cmap="Blues", annot=True, fmt=".1f", annot_kws={"size": 6}
    )
    plt.title("Correlation Matrix" if not title else title, fontsize=14)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    plt.show()
In [75]:
correlation_mtx(
    aggr.combined,
    *xy.cols.base_float,
    *xy.cols.base_int,
    *xy.cols.active[:5],  # . Power
    *xy.cols.reactive[:5],
    *xy.cols.bulk,  # .       Quantity (Additions)
    *xy.cols.wire[:4],
    *xy.cols.wire[5:],
    "target",
    title="Correlation Matrix I",
)
[Figure: Correlation Matrix I (phik heatmap)]
In [76]:
correlation_mtx(
    aggr.combined,
    *xy.cols.base_float,
    *xy.cols.base_int,
    *xy.cols.arc[:5],  # .     Arc / Idle
    *xy.cols.no_arc[:5],
    *xy.cols.bulk_t, # .       Time (Additions)
    *xy.cols.wire_t[:4],
    *xy.cols.wire_t[5:],
    "target",
    title="Correlation Matrix II",
)
[Figure: Correlation Matrix II (phik heatmap)]
In [77]:
correlation_mtx(
    aggr.combined,
    *xy.cols.base_float,
    *xy.cols.base_int,
    *xy.cols.active[:5],  # .  Power
    *xy.cols.reactive[:5],
    *xy.cols.bulk_t, # .       Time (Additions)
    *xy.cols.wire_t[:4],
    *xy.cols.wire_t[5:],
    "target",
    title="Correlation Matrix III",
)
[Figure: Correlation Matrix III (phik heatmap)]
In [78]:
correlation_mtx(
    aggr.combined,
    *xy.cols.base_float,
    *xy.cols.base_int,
    *xy.cols.arc[:5],  # .     Arc / Idle
    *xy.cols.no_arc[:5],
    *xy.cols.bulk, # .         Quantity (Additions)
    *xy.cols.wire[:4],
    *xy.cols.wire[5:],
    "target",
    title="Correlation Matrix IV",
)
[Figure: Correlation Matrix IV (phik heatmap)]
In [79]:
correlation_mtx(
    aggr.combined,
    "t_pos_w",
    "wires",
    "target",
    *xy.cols.base_float,
    *xy.cols.base_int,
    title="Correlation Matrix (Wires)",
)
[Figure: Correlation Matrix (Wires), phik heatmap]

Summary (Correlation)¶

The correlation analysis shows that some features are highly collinear, though the patterns do not appear in a strictly systematic way. In the next phase, tree-based models (Random Forest and gradient boosting) and a neural network will be employed; the tree-based methods generally cope well with redundant or correlated features. For the neural network, however, pruning strongly collinear features can simplify the architecture and sometimes improve performance, since fewer interdependent inputs may help the model converge more efficiently.

Feature Engineering¶

In [80]:
def apparent_power(
        df_in: pd.DataFrame, active: list[str], reactive: list[str]
    ) -> pd.DataFrame:
    df_out = df_in.copy()
    for s_id, (active_col, reactive_col) in enumerate(zip(active, reactive)):
        df_out[f"s{s_id+1}"] = np.sqrt(df_out[active_col]**2 + df_out[reactive_col]**2)
    return df_out


def power_factor(
        df_in: pd.DataFrame, active: list[str], apparent: list[str]
    ) -> pd.DataFrame:
    df_out = df_in.copy()
    for s_id, (active_col, apparent_col) in enumerate(zip(active, apparent)):
        power_factor_col = f"pf{s_id+1}"
        df_out[power_factor_col] = df_out[active_col] / df_out[apparent_col]
        df_out[power_factor_col].fillna(0.0, inplace=True)
    return df_out

Apparent Power¶

Apparent power ($S$) is a key concept in AC (alternating current) electrical systems.

It represents the total power flow from a source to a load, including both:

  • Active (real) power ($P$) — the power that is actually used or dissipated in the system (measured in watts or kilowatts).
  • Reactive power ($Q$) — the power that oscillates back and forth between the source and reactive components (inductors or capacitors) in the system (measured in volt-amperes reactive, or VAR).

Mathematically, apparent power is given by the magnitude of the complex power:

$$S = \sqrt{P^2 + Q^2}$$

In practical terms, apparent power reflects the total electrical capacity (in kVA) that equipment such as generators or transformers must be able to supply or handle, even though not all of it translates into real work (kW).
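
As a quick numeric illustration (values chosen for simplicity, not taken from the dataset): a phase drawing 3 kW of active power alongside 4 kVAR of reactive power requires an apparent power of

$$S = \sqrt{3^2 + 4^2} = 5 \ \text{kVA},$$

so the equipment must be rated for 5 kVA even though only 3 kW end up doing useful work.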

In [81]:
xy.synth = apparent_power(
    aggr.combined, active=xy.cols.active, reactive=xy.cols.reactive
)
xy.synth[xy.cols.apparent].head()
Out[81]:
s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16
2019-05-03 11:02:14 0.371123 0.902319 0.722536 0.642824 1.079934 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 11:34:14 0.440239 0.332731 0.860064 0.955315 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 12:06:54 1.373863 0.720979 0.925082 1.497189 0.502111 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 12:39:37 0.824445 0.393685 1.097105 1.084803 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 13:11:13 0.428064 0.722735 0.555308 1.110873 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

Power Factor¶

Power Factor ($PF$) is a measure of how efficiently electrical power is being converted into useful work. It represents the fraction of apparent power ($S$) that is actually used as active power ($P$) to perform useful work.

Formula for Power Factor:

$$\text{Power Factor (PF)} = \frac{\text{Active Power (P)}}{\text{Apparent Power (S)}}$$

Since apparent power ($S$) includes both active power ($P$) and reactive power ($Q$), we can also express it in terms of phase angle ($\theta$):

$$\text{PF} = \cos(\theta)$$

Where:

  • $\theta$ is the phase angle between voltage and current in an AC circuit.
  • A smaller angle ($\theta$) means more of the power is being used effectively (higher $PF$).
  • A larger angle means more reactive power is present, reducing efficiency.
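
Continuing the illustrative 3 kW / 4 kVAR example from the apparent-power section above:

$$\text{PF} = \frac{P}{S} = \frac{3}{5} = 0.6, \qquad \theta = \arccos(0.6) \approx 53.1^\circ,$$

meaning only 60% of the supplied apparent power is converted into useful work in that case.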
In [82]:
xy.synth = power_factor(
    xy.synth, active=xy.cols.active, apparent=xy.cols.apparent
)
xy.synth[xy.cols.power_x].head()
Out[82]:
pf1 pf2 pf3 pf4 pf5 pf6 pf7 pf8 pf9 pf10 pf11 pf12 pf13 pf14 pf15 pf16
2019-05-03 11:02:14 0.822181 0.848545 0.803161 0.806591 0.802950 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 11:34:14 0.865721 0.786416 0.825865 0.823102 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 12:06:54 0.801884 0.752692 0.837366 0.816839 0.839841 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 12:39:37 0.779854 0.789191 0.809271 0.797375 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2019-05-03 13:11:13 0.758211 0.793943 0.831286 0.803834 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

Explanation (Power)¶

Understanding Power Factor with an Analogy


Think of power like riding a bicycle up a hill:

  • Active Power ($P$) is like the forward motion that moves you up the hill (useful energy doing work).
  • Reactive Power ($Q$) is like the sideways movement due to wind—energy is present, but it’s not helping you move forward. However, this energy isn’t truly lost; it plays a vital role in maintaining the voltage levels necessary for the system.
  • Apparent Power ($S$) is the total effort you’re exerting (combination of forward motion and wind resistance).
  • Power Factor ($PF$) tells you how much of your effort is actually moving you forward.

If there’s no wind, all your effort moves you forward → $PF$ = 1 (ideal case, no reactive power). If there’s a strong crosswind, a lot of your effort is wasted sideways → $PF$ is lower.

Feature Sums¶

In [83]:
xy.synth["arc_sum"] = 0
for arc in xy.cols.arc:
    xy.synth["arc_sum"] += xy.synth[arc]

xy.synth["no_arc_sum"] = 0
for no_arc in xy.cols.no_arc:
    xy.synth["no_arc_sum"] += xy.synth[no_arc]

xy.synth["apparent_sum"] = 0
for apparent in xy.cols.apparent:
    xy.synth["apparent_sum"] += xy.synth[apparent]

xy.synth["power_x_sum"] = 0
for power_x in xy.cols.power_x:
    xy.synth["power_x_sum"] += xy.synth[power_x]


xy.synth[xy.cols.sums].head()
Out[83]:
arc_sum no_arc_sum apparent_sum power_x_sum
2019-05-03 11:02:14 1098 -485 3.718736 4.083428
2019-05-03 11:34:14 811 -333 2.588349 3.301105
2019-05-03 12:06:54 655 -870 5.019223 4.048621
2019-05-03 12:39:37 741 -352 3.400038 3.175691
2019-05-03 13:11:13 869 -493 2.816980 3.187274

This section creates aggregate metrics (arc_sum, no_arc_sum, apparent_sum, power_x_sum) by summing the corresponding time and power features for each record, making it easier to handle and compare overall process characteristics.
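
As a side note, the explicit accumulation loops above could be written more compactly with a row-wise sum; the following is an equivalent sketch, assuming the summed columns are numeric and contain no NaN values (pandas’ sum skips NaN by default, unlike the += loop):

# Vectorized form of the summation loops above
xy.synth["arc_sum"] = xy.synth[xy.cols.arc].sum(axis=1)
xy.synth["no_arc_sum"] = xy.synth[xy.cols.no_arc].sum(axis=1)
xy.synth["apparent_sum"] = xy.synth[xy.cols.apparent].sum(axis=1)
xy.synth["power_x_sum"] = xy.synth[xy.cols.power_x].sum(axis=1)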

Summary (EDA)¶

The data show that electrode heating (arc) events typically last around 100–210 seconds but can extend up to 900 seconds. Idle times between heating cycles display similar variability, occasionally spanning several hours. Initial (input_temp) temperatures for each batch cluster near 1580–1600°C and final (target) temperatures average slightly higher, pointing to a controlled but occasionally prolonged heating process. Despite some outliers, the temperature distribution is relatively narrow.

Bulk and wire additions reveal numerous unique combinations, with total addition times ranging from minutes to over an hour. Derived features such as apparent power, power factor, and aggregated arc/idle times help capture the complexity of operations. A brief correlation analysis shows moderate multicollinearity, which tree-based methods often tolerate well, while neural networks may benefit from targeted feature selection or dimensionality reduction for optimal predictive performance.

Training (Step III)¶

Data Preparation for Models¶

Data Split¶

In [84]:
bulks_counts = xy.synth["bulks"].value_counts()
unique_bulks = bulks_counts[bulks_counts == 1].index
xy.synth["signature"] = xy.synth["bulks"]
xy.synth.loc[xy.synth["bulks"].isin(unique_bulks), "signature"] = (
    "W-" + xy.synth["wires"]
)
stratification_threshold = 8
signatures = []

for signature, amount in xy.synth["signature"].value_counts().to_dict().items():
    if amount < stratification_threshold:
        signatures.append(signature)

xy.synth.loc[xy.synth["signature"].isin(signatures), "signature"] = "no-group"
xy.synth["signature"].value_counts().tail().to_frame()
Out[84]:
count
signature
3-6-11-14 9
3-6 8
3-4 8
6-11-12-14 8
11-12-14-15 8
In [85]:
features = xy.synth.drop(columns=[*xy.y_cols, "signature", "key"])
target = xy.synth["target"]

xy.X_train, xy.X_test, xy.y_train, xy.y_test = train_test_split(
    features, target, test_size=0.25, random_state=SEED, stratify=xy.synth["signature"]
)

xy.X_train.shape, xy.X_test.shape, xy.y_train.shape, xy.y_test.shape
Out[85]:
((1746, 157), (583, 157), (1746,), (583,))

Encoder¶

In [86]:
class Encoder:
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.one_hot_encoder = None
        self.feature_names = None
        self.is_fitted = False
        self.last_operation = "fit_transform"  # Track last operation mode

    def one_hot(self, df: pd.DataFrame, test: bool = False) -> pd.DataFrame:
        if test:
            if not self.is_fitted:
                raise ValueError("Encoder not fitted: Run with test=False first!")
            self.last_operation = "transform"
            encoded_array = self.one_hot_encoder.transform(df[self.cat_cols])
            encoded_df = pd.DataFrame(
                encoded_array, columns=self.feature_names, index=df.index
            )
        else:
            self.one_hot_encoder = OneHotEncoder(
                drop="first", handle_unknown="error", sparse_output=False
            )
            encoded_array = self.one_hot_encoder.fit_transform(df[self.cat_cols])
            self.feature_names = self.one_hot_encoder.get_feature_names_out(
                self.cat_cols
            )
            self.is_fitted = True
            self.last_operation = "fit_transform"
            encoded_df = pd.DataFrame(
                encoded_array, columns=self.feature_names, index=df.index
            )

        df_out = df.drop(columns=self.cat_cols)
        df_out = pd.concat([df_out, encoded_df], axis=1)

        return df_out

    def info(self) -> None:
        print(f"[Train/Test]: encoder.{self.last_operation}(x) is applied!")

    @staticmethod
    def replace_rare_classes(
        df_in: pd.DataFrame,
        cat_cols: list[str],
        min_frequency: int = 2,
        placeholder: str = "other",
    ) -> pd.DataFrame:
        df_out = df_in.copy()
        for col in cat_cols:
            category_counts = df_out[col].value_counts()
            rare_categories = (
                category_counts[category_counts < max(2, min_frequency)].index.tolist()
            )
            df_out[col] = df_out[col].apply(  # Replace rare categories with "other"
                lambda x: placeholder if x in rare_categories else x
            )
        return df_out
In [87]:
class CatTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols: list[str], min_frequency: int = 10):
        self.cat_cols = cat_cols
        self.min_frequency = min_frequency
        self.encoder_ = Encoder(cat_cols=self.cat_cols)  

    def fit(self, X, y=None):
        X_transformed = Encoder.replace_rare_classes(
            X, cat_cols=self.cat_cols, min_frequency=self.min_frequency
        )
        self.encoder_.one_hot(X_transformed, test=False)
        return self

    def transform(self, X):
        X_transformed = Encoder.replace_rare_classes(
            X, cat_cols=self.cat_cols, min_frequency=self.min_frequency
        )
        return self.encoder_.one_hot(X_transformed, test=True)

Datasets (Models)¶

In [88]:
@dataclass
class ML:
    frst: Xy = None
    catb: Xy = None
    nnet: Xy = None
    min_freq: int = 10

    @staticmethod
    def concat(x: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        return pd.concat([x, y.to_frame("target")], axis=1)


ml = ML()


for ml_model in ["frst", "catb", "nnet"]:
    setattr(
        ml,
        ml_model,
        Xy(
            cols=xy.cols,
            y_cols=["target"],
            synth=xy.synth.copy(),
            X_train=xy.X_train.copy(),
            y_train=xy.y_train.copy(),
            X_test=xy.X_test.copy(),
            y_test=xy.y_test.copy(),
        )
    )

In this step, the dataset is split into training and test sets using stratification based on a derived signature that combines bulk and wire information, ensuring balanced representation of different process patterns. Additionally, a custom encoder and transformer are implemented to preprocess categorical features, and the resulting datasets are encapsulated into dedicated structures for each model type (tree-based and neural network models), ensuring consistent and reproducible inputs for subsequent training.

Train Models¶

In [89]:
def logging_callback(study: optuna.study.Study, trial: optuna.trial.Trial):
    print(f"[{trial.number}] Lowest MAE: {study.best_value:.4f}")
In [90]:
def plot_optimization_results(study: optuna.study.Study) -> None:
    trials: pd.DataFrame = study.trials_dataframe()
    trials = trials[trials.state == "COMPLETE"]  # Filter completed trials
    trials = trials.sort_values(by="number")
    best_values = np.minimum.accumulate(trials["value"])
    importances = optuna_importance.get_param_importances(study)  # Compute importances
    axes_: np.ndarray = plt.subplots(1, 2, figsize=(8, 3))[1]
    axes: tuple[type_ax.Axes, ...] = (axes_[0], axes_[1])
    axes[0].plot(  # Plot Optimization History (Left)
        trials["number"], trials["value"], "o", label="Objective", alpha=0.5
    )
    axes[0].plot(
        trials["number"],
        best_values, "-",
        label="Best",
        color="red"
    )
    axes[0].set_xlabel("Trial Number")
    axes[0].set_ylabel("Objective Value")
    axes[0].set_title("Optimization History")
    axes[0].legend()
    if importances:  # Plot Parameter Importances (Right)
        params, values = zip(
            *sorted(importances.items(), key=lambda x: x[1], reverse=True)
        )
        axes[1].barh(params, values, color="skyblue")
        axes[1].set_xlabel("Importance")
        axes[1].set_title("Parameter Importances")
        axes[1].invert_yaxis()
    else:
        no_param_txt = "No parameter importances available."
        axes[1].text(0.5, 0.5, no_param_txt, ha='center', va='center', fontsize=12)
        axes[1].set_frame_on(False)
    plt.tight_layout()
    plt.show()

These functions serve as utilities for tracking and visualizing hyperparameter optimization. The logging_callback prints the trial number alongside the current best MAE after each trial, while plot_optimization_results generates plots that show both the optimization history (objective values over trials) and the relative importances of the tuned hyperparameters, helping to assess the tuning process and parameter impact.

Random Forest¶

In [91]:
def rf_objective(trial: optuna.trial.Trial) -> float:
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=50)
    max_depth = trial.suggest_int("max_depth", 5, 50, step=5)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    cat_transformer = CatTransformer(
        cat_cols=ml.frst.cols.base_str, 
        min_frequency=ml.min_freq
    )
    preprocessor = ColumnTransformer(
        transformers=[("replace_and_encode", cat_transformer, ml.frst.cols.base_str)],
        remainder="passthrough"
    )
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
            n_jobs=-1
        ))
    ])
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    scores = cross_val_score(
        pipeline,
        ml.frst.X_train,   # [!] **raw**, unencoded training data
        ml.frst.y_train,
        cv=cv,
        scoring="neg_mean_absolute_error"
    )
    return -np.mean(scores)
In [92]:
rf_study = optuna.create_study(direction="minimize", sampler=optuna_sampler)
rf_study.optimize(rf_objective, n_trials=RF_TRIALS, callbacks=[logging_callback])

pd.DataFrame({"parameter": rf_study.best_params})
[0] Lowest MAE: 6.1764
[1] Lowest MAE: 6.1733
[2] Lowest MAE: 6.1733
[3] Lowest MAE: 6.1733
[4] Lowest MAE: 6.1733
[5] Lowest MAE: 6.1733
[6] Lowest MAE: 6.1733
[7] Lowest MAE: 6.1733
[8] Lowest MAE: 6.1567
[9] Lowest MAE: 6.1567
[10] Lowest MAE: 6.1567
[11] Lowest MAE: 6.1567
[12] Lowest MAE: 6.1567
[13] Lowest MAE: 6.1567
[14] Lowest MAE: 6.1567
[15] Lowest MAE: 6.1567
[16] Lowest MAE: 6.1567
[17] Lowest MAE: 6.1567
[18] Lowest MAE: 6.1549
[19] Lowest MAE: 6.1549
Out[92]:
parameter
max_depth 15
min_samples_leaf 5
min_samples_split 6
n_estimators 250
In [93]:
plot_optimization_results(rf_study)
[Figure: Random Forest optimization history and parameter importances]

The Random Forest model achieved a best cross-validated MAE of approximately 6.15, comfortably below the project requirement of 6.8. Across 20 trials, the most influential hyperparameters proved to be max_depth (optimal at 15) and n_estimators (optimal at 250), with incremental improvements visible in the optimization history plot demonstrating how tuning these parameters drove down the MAE.

CatBoost¶

In [94]:
def catboost_objective(trial: optuna.trial.Trial) -> float:
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1500, step=50),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, step=0.01),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "verbose": 1 if DEV else 0,
        "random_seed": SEED
    }
    cat_transformer = CatTransformer(
        cat_cols=ml.catb.cols.base_str, min_frequency=ml.min_freq
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", cat_transformer, ml.catb.cols.base_str)
        ],
        remainder="passthrough"  # numeric features stay as-is
    )
    pipeline = Pipeline([  # Build pipeline: preprocessor + CatBoost model
        ("preprocessor", preprocessor),
        ("model", CatBoostRegressor(**params))
    ])
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)  # Cross-validation
    scores = cross_val_score(
        pipeline,
        ml.catb.X_train,   # [!] **raw**, unencoded training data
        ml.catb.y_train,
        cv=cv,
        scoring="neg_mean_absolute_error"
    )
    return -np.mean(scores)
In [95]:
catb_study = optuna.create_study(direction="minimize", sampler=optuna_sampler)
catb_study.optimize(
   catboost_objective, n_trials=CATB_TRIALS, callbacks=[logging_callback]
)
pd.DataFrame({"parameter": catb_study.best_params})
[0] Lowest MAE: 5.9191
[1] Lowest MAE: 5.9191
[2] Lowest MAE: 5.9191
[3] Lowest MAE: 5.8966
[4] Lowest MAE: 5.8966
[5] Lowest MAE: 5.8966
[6] Lowest MAE: 5.8966
[7] Lowest MAE: 5.8966
[8] Lowest MAE: 5.8966
[9] Lowest MAE: 5.8966
[10] Lowest MAE: 5.8966
[11] Lowest MAE: 5.8966
[12] Lowest MAE: 5.8966
[13] Lowest MAE: 5.8761
[14] Lowest MAE: 5.8761
[15] Lowest MAE: 5.8761
[16] Lowest MAE: 5.8761
[17] Lowest MAE: 5.8154
[18] Lowest MAE: 5.8154
[19] Lowest MAE: 5.8154
Out[95]:
parameter
bagging_temperature 0.998874
border_count 247.000000
depth 6.000000
iterations 650.000000
learning_rate 0.040000
In [96]:
plot_optimization_results(catb_study)
[Figure: CatBoost optimization history and parameter importances]

The CatBoost model reached a best cross-validated MAE of 5.82, easily surpassing the target requirement of 6.8. The most critical hyperparameters—learning_rate, iterations, border_count, depth, and bagging_temperature—all contributed to lowering the MAE, as shown in the optimization history and parameter importances. With 650 iterations, a depth of 6, and a learning rate of 0.04, the model demonstrated consistent improvements in predictive accuracy over the course of 20 trials.

Neural Network¶

Validation Split¶
In [97]:
# Replace Rare Classes
# ====================
ml.nnet.X_train = Encoder.replace_rare_classes(
    ml.nnet.X_train, ml.nnet.cols.base_str, min_frequency=ml.min_freq
)
ml.nnet.X_test = Encoder.replace_rare_classes(
    ml.nnet.X_test, ml.nnet.cols.base_str, min_frequency=ml.min_freq
)


# Test / Validation Split
# =======================
train_val: tuple[pd.DataFrame, ...] = train_test_split(
    ml.nnet.X_train,
    ml.nnet.y_train,
    test_size=0.2,
    random_state=SEED,
    stratify=ml.nnet.X_train["bulks"]
)
X_train, X_val, y_train, y_val = train_val


X_train.shape, X_val.shape, y_train.shape, y_val.shape
Out[97]:
((1396, 157), (350, 157), (1396,), (350,))
Column Selection¶
In [98]:
# Define Categorical Columns
# ==========================
cat_cols = ["wires", "bulks"]


# Exclude Columns
# ===============
exclude_cols = ["key", "target", "signature"] + [*cat_cols, "t_pos", "t_pos_w"]  # + cat
exclude_cols += xy.cols.apparent[4:] + xy.cols.power_x[4:]
exclude_cols += xy.cols.no_arc[4:]
exclude_cols += xy.cols.arc[4:]
exclude_cols += ["t_delta", "t_delta_w"]
corr_ = aggr.combined.drop(columns=xy.cols.base_str, errors="ignore").corr(
    method="spearman"
)

# Threshold for Correlated Columns
# ================================
threshold = 0.70  # __________ [!]
# --------------------------------
corr_cols_list = [xy.cols.base_float, xy.cols.arc, xy.cols.no_arc, xy.cols.active]
corr_cols_list += [xy.cols.reactive, xy.cols.bulk, xy.cols.bulk_t, xy.cols.wire]
corr_cols_list += [xy.cols.wire_t, xy.cols.apparent, xy.cols.power_x]

removed_features = set()
for corr_cols in corr_cols_list:
    for i, feature in enumerate(corr_cols):
        if feature in removed_features:
            continue  # Skip already removed features
        # Check correlation with other features in the same group
        correlated_features = [
            other for other in corr_cols[i+1:] 
            if corr_.abs().get(feature, pd.Series()).get(other, 0) > threshold
        ]
        # Remove only the second feature in each pair
        removed_features.update(correlated_features)

exclude_cols += list(removed_features)  # Add removed features to exclude_cols
print(removed_features)
    

# Numeric Columns
# ================
num_cols = [c for c in ml.nnet.synth.columns if c not in exclude_cols]

len(num_cols), cat_cols
{'t29', 'p21', 't24', 't21', 'p29', 'p22', 't20', 'p30', 'p26', 'p25', 't25', 't28'}
Out[98]:
(97, ['wires', 'bulks'])
Label Encoder¶
In [99]:
label_encoders: dict[str, LabelEncoder] = {}


for col in cat_cols:
    if col in X_train.columns:
        le = LabelEncoder()

        # Fit & transform on train
        # ------------------------
        X_train[col] = le.fit_transform(X_train[col].astype(str))

        # Transform val/test using train encoder
        # --------------------------------------
        X_val[col] = le.transform(X_val[col].astype(str))
        ml.nnet.X_test[col] = le.transform(ml.nnet.X_test[col].astype(str))

        label_encoders[col] = le  # Save encoder for future use
    else:
        print(f"Warning: Column {col} not found in DataFrame!")


X_train.shape, X_val.shape, y_train.shape, y_val.shape
Out[99]:
((1396, 157), (350, 157), (1396,), (350,))
Standard Scaler¶
In [100]:
scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

X_val[num_cols] = scaler.transform(X_val[num_cols])
ml.nnet.X_test[num_cols] = scaler.transform(ml.nnet.X_test[num_cols])
Dataset¶
In [101]:
class SteelDataset(Dataset):
    def __init__(
        self, dataframe: pd.DataFrame,
        feature_cols: list[str],
        cat_cols: list[str],
        input_temp_col: list[str],
        target_col: str
    ):
        self.data: pd.DataFrame = dataframe
        self.feature_cols = feature_cols  # . ____________ Numeric features
        self.cat_cols = cat_cols  # . ____________________ Categorical features
        self.input_temp_col = input_temp_col  # . ________ Skip Connection
        self.target_col = target_col

        self.X_num = self.data[self.feature_cols].values.astype(np.float32)
        self.X_cat = self.data[self.cat_cols].astype(np.int32).values
        self.input_temps = self.data[self.input_temp_col].values.astype(np.float32)
        self.y = self.data[self.target_col].values.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x_no_temp = self.X_num[idx]
        x_cat = self.X_cat[idx]
        input_temp = self.input_temps[idx]
        y = self.y[idx]

        return x_no_temp, x_cat, input_temp, y
Model (NN)¶
In [102]:
class DeltaTemperature(nn.Module):
    def __init__(
        self,
        input_dim,
        hidden_dims=[256, 128, 64],
        embedding_sizes=None,
        is_sigmoid=False
    ):
        super().__init__()
        self.is_sigmoid = is_sigmoid
        if embedding_sizes is not None and len(embedding_sizes) > 0:
            self.embeddings = nn.ModuleList([
                nn.Embedding(num_emb, emb_dim) for (num_emb, emb_dim) in embedding_sizes
            ])
        else:
            self.embeddings = None
        total_emb_dim = 0
        if self.embeddings:
            total_emb_dim = sum(emb.embedding_dim for emb in self.embeddings)
        self.funnel_layers = nn.ModuleList()
        prev_dim = input_dim + total_emb_dim 
        for hd in hidden_dims:
            self.funnel_layers.append(nn.Linear(prev_dim, hd))
            if self.is_sigmoid:
                self.funnel_layers.append(nn.Sigmoid())
            else:
                self.funnel_layers.append(nn.Tanh())
            self.funnel_layers.append(nn.BatchNorm1d(hd))
            prev_dim = hd
        self.output_layer = nn.Linear(prev_dim, 1)

    def forward(self, x_no_temp, x_cat, input_temp):
        if self.embeddings:
            cat_embeds = []
            for i, emb in enumerate(self.embeddings):
                cat_embeds.append(emb(x_cat[:, i])) 
            cat_embeds = torch.cat(cat_embeds, dim=1)
            x = torch.cat([x_no_temp, cat_embeds], dim=1)
        else:  # purely numeric
            x = x_no_temp
        for layer in self.funnel_layers:
            x = layer(x)
        delta = self.output_layer(x).squeeze(dim=1)  # shape (batch_size,)
        final_temp = input_temp + delta  # .______________________ Skip Connection
        return final_temp
Training Function¶
In [103]:
def train_model(
    model: DeltaTemperature,
    train_loader,
    val_loader,
    n_epochs=100,
    lr=1e-4,
    patience=20,
    device="cpu"
):
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = ReduceLROnPlateau(
        optimizer, mode='min', factor=0.9, patience=3, min_lr=1e-7
    )
    criterion = nn.L1Loss()
    best_val_loss = float('inf')
    best_state_dict = None
    epochs_no_improve = 0

    for epoch in range(n_epochs):
        model.train()  # TRAINING
        running_train_loss = 0.0
        for x_no_temp, x_cat, input_temp, y in train_loader:
            x_no_temp  = x_no_temp.to(device)
            x_cat = x_cat.to(device)
            input_temp = input_temp.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            preds = model(x_no_temp, x_cat, input_temp)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            running_train_loss += loss.item() * x_no_temp.size(0)
        epoch_train_loss = running_train_loss / len(train_loader.dataset)

        model.eval()  # VALIDATION
        running_val_loss = 0.0
        with torch.no_grad():
            for x_no_temp, x_cat, input_temp, y in val_loader:
                x_no_temp  = x_no_temp.to(device)
                x_cat = x_cat.to(device)
                input_temp = input_temp.to(device)
                y = y.to(device)
                preds = model(x_no_temp, x_cat, input_temp)
                loss = criterion(preds, y)
                running_val_loss += loss.item() * x_no_temp.size(0)
        epoch_val_loss = running_val_loss / len(val_loader.dataset)
        scheduler.step(epoch_val_loss)   # Adjust learning rate if no improvement
        every_epoch = 30
        if epoch_val_loss > 100.0:
            every_epoch = 200
        if epoch % every_epoch == 0:
            for param_group in optimizer.param_groups:
                if DEV:
                    print(
                        f"Epoch [{epoch+1}/{n_epochs}] ", 
                        f"| Train MAE/MSE: {epoch_train_loss:.3f} ",
                        f"| Val MAE/MSE: {epoch_val_loss:.3f}",
                        f"| Current LR: {param_group['lr']:.8f}"
                    )
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            best_state_dict = model.state_dict()
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:  # EARLY STOPPING
                break
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)
    return best_val_loss, model
Model Architecture¶
In [104]:
feature_cols = num_cols + cat_cols  # numeric scaled + label-encoded categorical
input_temp_col = "input_temp"
target_col = "target"

dataset_columns = [feature_cols, cat_cols, input_temp_col, target_col]
train_dataset = SteelDataset(ML.concat(X_train, y_train), *dataset_columns)
val_dataset   = SteelDataset(ML.concat(X_val, y_val), *dataset_columns)
test_dataset  = SteelDataset(
    ML.concat(ml.nnet.X_test, ml.nnet.y_test), *dataset_columns
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_sizes=[
    (
        len(label_encoders[col].classes_) + 1,
        min(50, (len(label_encoders[col].classes_) // 2) + 1)
    ) for col in cat_cols if col in label_encoders
]
model_preview = DeltaTemperature(
    input_dim=len(feature_cols),
    hidden_dims=[256, 128, 64],
    embedding_sizes=embedding_sizes
).to(device)
summary(
    model_preview,
    input_data=(
        torch.randn(1, len(feature_cols)).to(device),  # x_no_temp
        torch.randint(0, 10, (1, len(cat_cols))).to(device),  # x_cat
        torch.randn(1).to(device),  # input_temp
    ),
    col_names=["input_size", "num_params"],
    depth=2,
    device=device
)
Out[104]:
==========================================================================================
Layer (type:depth-idx)                   Input Shape               Param #
==========================================================================================
DeltaTemperature                         [1, 99]                   --
├─ModuleList: 1-1                        --                        --
│    └─Embedding: 2-1                    [1]                       32
│    └─Embedding: 2-2                    [1]                       435
├─ModuleList: 1-2                        --                        --
│    └─Linear: 2-3                       [1, 118]                  30,464
│    └─Tanh: 2-4                         [1, 256]                  --
│    └─BatchNorm1d: 2-5                  [1, 256]                  512
│    └─Linear: 2-6                       [1, 256]                  32,896
│    └─Tanh: 2-7                         [1, 128]                  --
│    └─BatchNorm1d: 2-8                  [1, 128]                  256
│    └─Linear: 2-9                       [1, 128]                  8,256
│    └─Tanh: 2-10                        [1, 64]                   --
│    └─BatchNorm1d: 2-11                 [1, 64]                   128
├─Linear: 1-3                            [1, 64]                   65
==========================================================================================
Total params: 73,044
Trainable params: 73,044
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.07
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.29
Estimated Total Size (MB): 0.30
==========================================================================================
Hyperparameter¶
In [105]:
def nn_objective(trial: optuna.trial.Trial):
    hidden_dim_1 = trial.suggest_categorical(
        "hidden_dim_1", [280, 256, 240, 232, 200, 192]
    )
    hidden_dim_2 = trial.suggest_categorical("hidden_dim_2", [128, 120, 104, 96])
    hidden_dim_3 = trial.suggest_categorical("hidden_dim_3", [64, 32])
    is_sigmoid = trial.suggest_categorical("is_sigmoid", [True, False])
    batch_size = trial.suggest_categorical("batch_size", [8, 12, 16])
    lr = trial.suggest_categorical("lr", [9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 9e-5, 7e-5])
    model = DeltaTemperature(
        input_dim=len(feature_cols),
        hidden_dims=[hidden_dim_1, hidden_dim_2, hidden_dim_3],
        is_sigmoid=is_sigmoid
    ).to(device)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    val_loss, _ = train_model(  # Train Model
        model,
        train_loader,
        val_loader,
        n_epochs=2_000,
        lr=lr,
        patience=50,
        device=device
    )
    return val_loss
In [106]:
nn_study = optuna.create_study(direction="minimize", sampler=optuna_sampler)
nn_study.optimize(nn_objective, n_trials=NN_TRIALS, callbacks=[logging_callback])
pd.DataFrame({"parameter": nn_study.best_params})
[0] Lowest MAE: 6.2347
[1] Lowest MAE: 6.2347
[2] Lowest MAE: 6.2347
[3] Lowest MAE: 6.2347
[4] Lowest MAE: 6.2212
[5] Lowest MAE: 6.2212
[6] Lowest MAE: 6.2212
[7] Lowest MAE: 6.2212
[8] Lowest MAE: 6.2212
[9] Lowest MAE: 6.1604
[10] Lowest MAE: 6.1604
[11] Lowest MAE: 6.1604
[12] Lowest MAE: 6.1604
[13] Lowest MAE: 6.1604
[14] Lowest MAE: 6.1604
[15] Lowest MAE: 6.1604
[16] Lowest MAE: 6.1604
[17] Lowest MAE: 6.1604
[18] Lowest MAE: 6.1604
[19] Lowest MAE: 6.1604
Out[106]:
parameter
batch_size 12
hidden_dim_1 280
hidden_dim_2 128
hidden_dim_3 32
is_sigmoid True
lr 0.0003
In [107]:
plot_optimization_results(nn_study)
[Figure: neural network optimization history and parameter importances]

In the first stage, we prepare the data for a neural network by label-encoding the remaining categorical features, standard-scaling the numeric columns, and creating a custom PyTorch Dataset class (SteelDataset) that incorporates both numeric and categorical data, as well as a skip connection for the initial temperature feature. The DeltaTemperature model itself is designed to handle embeddings for categorical inputs, optionally use sigmoid activation, and apply a final skip connection so that the network predicts the temperature increment rather than the absolute temperature.

In the second stage, we perform hyperparameter tuning with Optuna, exploring hidden layer sizes, batch size, activation function (sigmoid vs. tanh), and learning rate. After up to 2,000 training epochs (with early stopping), the best configuration achieves a validation MAE of around 6.16, which meets the project’s requirement of staying below 6.8.
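
The final, held-out evaluation below focuses on CatBoost. Purely as an illustration, and not a step shown in this notebook, the tuned network could be refit with the best trial’s settings and scored on the test split roughly as follows (a minimal sketch that reuses the objects defined above):

# Refit DeltaTemperature with the best Optuna trial and compute the test-set MAE
best = nn_study.best_params

final_nn = DeltaTemperature(
    input_dim=len(feature_cols),
    hidden_dims=[best["hidden_dim_1"], best["hidden_dim_2"], best["hidden_dim_3"]],
    is_sigmoid=best["is_sigmoid"],
).to(device)

train_loader = DataLoader(train_dataset, batch_size=best["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best["batch_size"], shuffle=False)
_, final_nn = train_model(
    final_nn, train_loader, val_loader,
    n_epochs=2_000, lr=best["lr"], patience=50, device=device,
)

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
final_nn.eval()
abs_err, n_obs = 0.0, 0
with torch.no_grad():
    for x_no_temp, x_cat, input_temp, y in test_loader:
        preds = final_nn(
            x_no_temp.to(device), x_cat.to(device), input_temp.to(device)
        )
        abs_err += (preds.cpu() - y).abs().sum().item()
        n_obs += len(y)
print("Neural network test MAE:", abs_err / n_obs)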

Summary (Training)¶

In the training phase, three primary models—Random Forest, CatBoost, and a custom PyTorch neural network—were tuned and evaluated using cross-validation and the Mean Absolute Error (MAE) metric. Each model was set up with a pipeline to handle preprocessing (e.g., categorical encoding) and used Optuna for hyperparameter optimization. The tree-based models (Random Forest and CatBoost) employed K-fold cross-validation to identify their best parameters, while the neural network involved splitting off a validation set and training with early stopping. Among these, CatBoost delivered the strongest performance on validation data, achieving an MAE of around 5.82, comfortably below the project’s target threshold of 6.8.

Evaluation (Step IV)¶

Feature Importance¶

In [108]:
catb_X_train = Encoder.replace_rare_classes(
    ml.catb.X_train, cat_cols=ml.catb.cols.base_str, min_frequency=ml.min_freq
)
encoder = Encoder(ml.catb.cols.base_str)
catb_X_train = encoder.one_hot(catb_X_train, test=False)
encoder.info()

model = CatBoostRegressor(
    **catb_study.best_params,
    random_seed=SEED,
    verbose=False,
)
model.fit(catb_X_train, ml.catb.y_train)
feature_importance_df = pd.DataFrame({
    "Feature": catb_X_train.columns, "Importance": model.get_feature_importance()
})
feature_importance_df_top = feature_importance_df.sort_values(
    by="Importance", ascending=False
)
top = 20
plt.figure(figsize=(8, 6))
plt.barh(
    feature_importance_df_top[:top]["Feature"],
    feature_importance_df_top[:top]["Importance"],
    color="grey"
)
plt.xlabel("Feature Importance")
plt.ylabel("Feature Name")
plt.title("CatBoost Feature Importance")
plt.gca().invert_yaxis()
plt.show()
[Train/Test]: encoder.fit_transform(x) is applied!
[Figure: CatBoost feature importance, top 20 features]
In [109]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(catb_X_train)
shap.summary_plot(shap_values, catb_X_train, show=False)
plt.gcf().set_size_inches(8, 6)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()
[Figure: SHAP summary plot for the CatBoost model]

After finalizing the best hyperparameters from the CatBoost cross-validation experiments, we retrained the model on the training set and examined its feature importance. As shown in both the CatBoost native importance plot and the SHAP summary plot, the initial temperature (input_temp) emerges as the strongest predictor, followed by the cumulative arc time (arc_sum), wire usage (wire_1), and the idle time (no_arc_sum). These results provide clear insight into which variables most strongly influence the model’s temperature predictions, and they confirm that both the initial heat state and the total energy input are key drivers of final steel temperature.

In [110]:
plt.figure(figsize=(8, 5))
df_scatter = ml.concat(ml.catb.X_train, ml.catb.y_train)
sns.scatterplot(
    data=df_scatter, x="input_temp", y="target", alpha=0.3, label="Data Points"
)
sns.regplot(
    data=df_scatter, x="input_temp", y="target",
    scatter=False, color="black", label="Trend Line"
)
plt.title("Relationship between Input Temperature and Target")
plt.xlabel("Input Temperature")
plt.ylabel("Target Temperature")
plt.legend()
plt.show()
[Figure: scatter plot of input temperature vs. target temperature with trend line]

As the input temperature increases, the target temperature also tends to rise, indicating a moderate positive correlation.

Test¶

In [111]:
ml.catb.X_test = Encoder.replace_rare_classes(
    ml.catb.X_test, cat_cols=ml.catb.cols.base_str, min_frequency=ml.min_freq
)
ml.catb.X_test = encoder.one_hot(ml.catb.X_test, test=True)
encoder.info()

y_pred = model.predict(ml.catb.X_test, verbose=False)
mae = mean_absolute_error(ml.catb.y_test, y_pred)
print("Test set MAE:", mae)
[Train/Test]: encoder.transform(x) is applied!
Test set MAE: 6.174001690294176

On the held-out test set, the CatBoost model achieves an MAE of approximately 6.17, which remains comfortably below the project’s threshold of 6.8. This final evaluation confirms that the model generalizes well to unseen data, reinforcing the reliability of the earlier cross-validation and hyperparameter tuning process.

Baseline Model¶

In [112]:
catboost_r2 = r2_score(ml.catb.y_test, y_pred)
print(f"CatBoost R²: {catboost_r2:.3f}")

naive_prediction = np.mean(ml.catb.y_train)
naive_mae = np.mean(np.abs(ml.catb.y_test - naive_prediction))
models = [f"Naive Model (MAE {naive_mae:.2f})", f"CatBoost (MAE {mae:.2f})"]
mae_scores = [naive_mae, mae]

plt.figure(figsize=(8, 2))
plt.bar(models, mae_scores, color=["gray", "#333"])
plt.xlabel("Model Type")
plt.ylabel("Mean Absolute Error (MAE)")
plt.title("Comparison: Naive Model vs. CatBoost")
plt.show()
CatBoost R²: 0.493
[Figure: Comparison: Naive Model vs. CatBoost (MAE bar chart)]

Compared to the naive baseline (which simply predicts the average training temperature and yields an MAE of about 8.55), the CatBoost model is a clear improvement, reaching an MAE of 6.17. Additionally, CatBoost achieves an R² of 0.493, indicating that it explains nearly half of the variance in the target temperature, further underscoring its advantage over the baseline approach.
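The same mean-prediction baseline can also be expressed with scikit-learn's DummyRegressor, which is convenient if the comparison ever needs to live inside a pipeline. A small sketch using the already encoded feature frames:

# Equivalent naive baseline via scikit-learn: always predicts the training mean.
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor(strategy="mean")
dummy.fit(catb_X_train, ml.catb.y_train)
dummy_mae = mean_absolute_error(ml.catb.y_test, dummy.predict(ml.catb.X_test))
print(f"DummyRegressor MAE: {dummy_mae:.2f}")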

Comparison Chart¶

In [113]:
num_samples = 30  
observations = np.arange(1, num_samples + 1)
actuals = ml.catb.y_test[:num_samples].to_numpy()
true_label, pred_label = "Test Temperature", "Predicted Temperature"
shift_value = 1550  # ._____________________________________________ Shift y-axis scale

plt.figure(figsize=(8, 5))
plt.bar(observations, actuals, width=0.6, label=true_label, color="#333", alpha=0.5)
plt.bar(observations, y_pred[:num_samples], width=0.2, label=pred_label, color="#222")

plt.xlabel("Observation")
plt.ylabel("Temperature")
plt.title("Test vs. Predicted Temperatures (CatBoost Model)")
plt.xticks(observations)
y_ticks = np.arange(shift_value, shift_value + 100, 50)
plt.yticks(y_ticks, labels=(y_ticks).astype(int))
plt.ylim(shift_value, shift_value + 100)
plt.legend()
plt.show()
[Figure: Test vs. Predicted Temperatures (CatBoost Model)]

This bar chart compares 30 test samples of actual vs. predicted temperatures from the CatBoost model, illustrating that the model’s predictions generally align closely with the observed test values.

Bootstrap Test¶

In [114]:
def bootstrap_catboost(
    model: CatBoostRegressor,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    num_bootstraps: int = 1000,
    ci: int = 95,
) -> tuple[float, float, float, np.ndarray]:
    """Bootstrap the test-set MAE; returns (ci_lower, ci_upper, mean, scores)."""
    n_ = X_test.shape[0]
    mae_scores = []
    for _ in range(num_bootstraps):
        sample_indices = np.random.choice(n_, n_, replace=True)
        X_bs = X_test.iloc[sample_indices]
        y_bs = y_test.iloc[sample_indices]
        preds = model.predict(X_bs)
        sample_mae = mean_absolute_error(y_bs, preds)
        mae_scores.append(sample_mae)
    mae_scores = np.array(mae_scores)
    alpha = 100 - ci
    mae_lower = np.percentile(mae_scores, alpha / 2).item()
    mae_upper = np.percentile(mae_scores, 100 - alpha / 2).item()
    mae_mean = float(mae_scores.mean())

    return mae_lower, mae_upper, mae_mean, mae_scores
In [115]:
mae_lower, mae_upper, mae_mean, mae_scores = bootstrap_catboost(
    model=model,
    X_test=ml.catb.X_test,
    y_test=ml.catb.y_test,
    num_bootstraps=2_000,
    ci=95
)
threshold = 6.8
prob_mae_below_threshold = round((mae_scores < threshold).mean() * 100, 2)

plt.figure(figsize=(8, 5))
plt.hist(mae_scores, bins=100, alpha=0.7, color='grey', edgecolor='#333')
plt.axvline(mae_lower, linestyle='--', label=f'CI Lower = {mae_lower:.3f}')
plt.axvline(mae_upper, color='red', linestyle='--', label=f'CI Upper = {mae_upper:.3f}')
plt.axvline(
    mae_mean, color='blue',
    linestyle='-',
    linewidth=8,
    alpha=0.3,
    label=f'Mean = {mae_mean:.3f}'
)
plt.axvline(
    threshold,
    color='green',
    linestyle='-',
    linewidth=8,
    alpha=0.3,
    label=f'Probability (MAE<6.8) = {prob_mae_below_threshold} %'
)

plt.title("Bootstrap MAE Distribution (CatBoost)")
plt.xlabel("MAE")
plt.ylabel("Frequency")
plt.legend()
plt.show()
[Figure: Bootstrap MAE Distribution (CatBoost)]

To assess the model's stability, we resampled the test set with replacement 2,000 times and recomputed the MAE on each bootstrap sample. The resulting distribution shows an average MAE of about 6.19, with a 95% confidence interval of roughly 5.73 to 6.67; more than 99% of the bootstrap estimates fall below the 6.8 target threshold, demonstrating both strong performance and robustness.

Conclusion (Step V)¶

This project aimed to develop an accurate and efficient predictive model for temperature estimation in a steel manufacturing process. The approach incorporated a rigorous data preprocessing pipeline, hyperparameter optimization, and model evaluation techniques to ensure robust performance. The main objectives were met through a systematic process involving data stratification, categorical encoding, and feature engineering, followed by training and evaluation of multiple machine learning models.

Key Takeaways¶

Data Preprocessing and Feature Engineering

  • The dataset was split using a stratification strategy based on derived signatures to maintain balanced distributions of process variations.
  • Rare categorical values were consolidated to improve model generalization (a minimal sketch of this idea appears after this list).
  • A combination of one-hot encoding and custom transformations was implemented to prepare categorical features for different models.
  • Highly correlated numerical features were identified and removed to prevent redundancy.
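For readers of this section only, the rare-class consolidation mentioned above boils down to mapping infrequent category levels to a shared label. A minimal pandas sketch of that idea (the project's Encoder.replace_rare_classes is the authoritative implementation and may differ in details such as the placeholder label):

# Illustrative sketch of rare-class consolidation: levels occurring fewer than
# min_frequency times are mapped to a shared "other" label.
import pandas as pd

def replace_rare(df: pd.DataFrame, cat_cols: list[str], min_frequency: int) -> pd.DataFrame:
    df = df.copy()
    for col in cat_cols:
        counts = df[col].value_counts()
        rare = counts[counts < min_frequency].index
        df[col] = df[col].where(~df[col].isin(rare), other="other")
    return df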

Model Training and Optimization

  • Three predictive models were explored: Random Forest, CatBoost, and a custom PyTorch Neural Network.
  • Optuna was used for hyperparameter tuning, leading to significant performance improvements across all models.
  • The best models achieved the following cross-validated Mean Absolute Error (MAE) scores:
    • Random Forest: 6.15
    • CatBoost: 5.82 (best performance)
    • Neural Network: 6.16
  • The CatBoost model demonstrated the highest accuracy, achieving an optimal balance of complexity and interpretability.

Model Evaluation

  • Feature importance analysis using SHAP and CatBoost’s built-in importance scores confirmed that input temperature, arc time, and wire usage were the most influential predictors.
  • The final CatBoost model was evaluated on the test set, achieving an MAE of 6.17, which is well below the project requirement of 6.8.
  • A comparative analysis showed that the CatBoost model significantly outperformed a naive baseline model, reducing MAE from 8.55 to 6.17 and achieving an R² score of 0.49.
  • A bootstrap analysis further validated the model's reliability, demonstrating a high probability that real-world performance would remain within acceptable limits.

Final Assessment¶

The project successfully built and validated a machine learning pipeline that met the predefined accuracy requirements. The CatBoost model emerged as the best-performing model due to its balance of predictive power, interpretability, and stability across various evaluation methods. By leveraging advanced feature engineering, hyperparameter tuning, and robust evaluation techniques, the project provides a scalable framework for temperature prediction in steel manufacturing.

Future Work¶

While the current model performs well, there are opportunities for further refinement:

  • Exploring additional features: Incorporating real-time sensor data could enhance model accuracy.
  • Ensemble Methods: Combining multiple models through stacking or boosting may further improve performance (see the sketch after this list).
  • Deployment Optimization: Implementing the model in a real-time production environment with adaptive learning capabilities could enhance operational efficiency.
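As a concrete starting point for the ensemble idea, stacking could be prototyped with scikit-learn's StackingRegressor. The sketch below is illustrative only: estimator choices and hyperparameters are placeholders, not tuned values.

# Illustrative stacking sketch: combine the tuned tree models behind a simple
# linear meta-learner. Hyperparameters and estimator choices are placeholders.
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge

stack = StackingRegressor(
    estimators=[
        ("rf", RandomForestRegressor(n_estimators=300, random_state=SEED)),
        ("catb", CatBoostRegressor(**catb_study.best_params, random_seed=SEED, verbose=False)),
    ],
    final_estimator=Ridge(alpha=1.0),
    cv=5,
)
# stack.fit(catb_X_train, ml.catb.y_train)
# stack_mae = mean_absolute_error(ml.catb.y_test, stack.predict(ml.catb.X_test))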

Overall, this project demonstrates the effectiveness of machine learning in industrial process optimization and provides a strong foundation for further innovation in predictive analytics.