# -*- coding: utf-8 -*-
"""
Created on Sat Oct  4 09:43:51 2025

@author: Moritz Romeike
"""

# --------------------------------------------------------------------------------
# Programmcode 33 (Python, ohne statsmodels): Robuste lineare Regression
# Abhängigkeiten: pandas, numpy, matplotlib
# --------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# ===========================
# 1) Daten laden & vorbereiten
# ===========================
csv_path = "data_robust_linear_regression.csv"
df = pd.read_csv(csv_path, sep=';', dtype=str, encoding='utf-8', engine='python')

# Spalten trimmen und Dezimalkomma konvertieren
df.columns = df.columns.str.strip()
for c in df.columns:
    df[c] = df[c].astype(str).str.strip()

def to_float(s: pd.Series) -> pd.Series:
    return pd.to_numeric(
        s.str.replace(".", "", regex=False).str.replace(",", ".", regex=False),
        errors="coerce"
    )

# Erwartete Spalten: x, y; optionale Indexspalte X
for col in ("x", "y"):
    if col not in df.columns:
        raise KeyError(f"Spalte '{col}' fehlt. Vorhanden: {list(df.columns)}")
    df[col] = to_float(df[col])

if "X" not in df.columns:
    df["X"] = np.arange(1, len(df) + 1)

x = df["x"].to_numpy(float)
y = df["y"].to_numpy(float)
n = len(y)

# ===========================
# 2) Hilfsfunktionen
# ===========================
def ols_fit(x, y):
    """OLS per Hand: beta, fitted, resid, sigma, SE(beta)"""
    n = len(y)
    X = np.column_stack([np.ones(n), x])
    XtX = X.T @ X
    beta = np.linalg.inv(XtX) @ (X.T @ y)
    yhat = X @ beta
    resid = y - yhat
    p = 2
    s2 = np.sum(resid**2) / (n - p)
    se_beta = np.sqrt(np.diag(s2 * np.linalg.inv(XtX)))
    return beta, yhat, resid, np.sqrt(s2), se_beta

def mad(arr):
    """Median Absolute Deviation (robuste Skale)."""
    med = np.median(arr)
    return np.median(np.abs(arr - med))

def huber_weights(r, s, k=1.345):
    """Huber-Gewichte: w = 1, wenn |r/s|<=k, sonst w = k/|r/s|."""
    z = r / (s + 1e-12)
    w = np.ones_like(z)
    mask = np.abs(z) > k
    w[mask] = (k / (np.abs(z[mask]) + 1e-12))
    return w

def tukey_biweight_weights(r, s, c=4.685):
    """Tukey's Biweight (MM-Schritt)."""
    z = r / (s + 1e-12)
    w = np.zeros_like(z)
    m = np.abs(z) < c
    z2 = z[m] / c
    w[m] = (1 - z2**2)**2
    return w

def irls(x, y, w_func, init_beta=None, max_iter=100, tol=1e-6, scale_from="mad", **kwargs):
    """
    Iteratively Reweighted Least Squares.
    w_func(r, s, **kwargs) -> weights (0..1)
    scale_from: "mad" oder "sigma_ols"
    """
    n = len(y)
    X = np.column_stack([np.ones(n), x])

    # Start mit OLS
    if init_beta is None:
        beta, _, resid, sigma, _ = ols_fit(x, y)
    else:
        beta = init_beta
        resid = y - X @ beta
        sigma = np.sqrt(np.sum(resid**2) / (n - 2))

    # Anfangsskale robust
    if scale_from == "mad":
        s = 1.4826 * mad(resid) if mad(resid) > 0 else sigma
    else:
        s = sigma

    for _ in range(max_iter):
        r = y - X @ beta
        w = w_func(r, s, **kwargs)
        # Gew. Normalgleichungen
        W = np.diag(w)
        XtW = X.T @ W
        beta_new = np.linalg.pinv(XtW @ X) @ (XtW @ y)

        # neue Skale (robust)
        r_new = y - X @ beta_new
        s_new = 1.4826 * mad(r_new) if mad(r_new) > 0 else np.sqrt(np.sum((w * r_new**2)) / max(n - 2, 1))

        if np.linalg.norm(beta_new - beta) < tol * (np.linalg.norm(beta) + 1e-12):
            beta = beta_new
            s = s_new
            resid = r_new
            break

        beta = beta_new
        s = s_new
        resid = r_new

    # Fitted & einfache SE(beta) (annähernd mit w als Gewichte)
    yhat = X @ beta
    # geschätzte Varianz via gewichtete SSE
    s2 = np.sum((w * (y - yhat)**2)) / max(n - 2, 1)
    XtWX_inv = np.linalg.pinv(XtW @ X)
    se_beta = np.sqrt(np.diag(s2 * XtWX_inv))
    return beta, yhat, resid, np.sqrt(s2), se_beta, w

# ===========================
# 3) Fits: KQ, KQ ohne 8, KQ ohne 8 & 11
# ===========================
beta_kq, yhat_kq, resid_kq, sigma_kq, se_kq = ols_fit(x, y)

def drop_1based(arr, idx):
    mask = np.ones(len(arr), dtype=bool)
    for i in idx:
        j = i - 1
        if 0 <= j < len(arr):
            mask[j] = False
    return mask

mask_ohne8 = drop_1based(x, [8])
beta_kq_8, _, _, _, _ = ols_fit(x[mask_ohne8], y[mask_ohne8])

mask_ohne8_11 = drop_1based(x, [8, 11])
beta_kq_8_11, _, _, _, _ = ols_fit(x[mask_ohne8_11], y[mask_ohne8_11])

# ===========================
# 4) Robuste Fits: Huber-M & MM
# ===========================
# Huber-M (IRLS mit Huber-Gewichten)
beta_huber, yhat_huber, resid_huber, sigma_huber, se_huber, w_huber = irls(
    x, y, w_func=huber_weights, k=1.345, scale_from="mad"
)

# MM: Start mit Huber, dann Tukey-Biweight Reweighting
beta_mm, yhat_mm, resid_mm, sigma_mm, se_mm, w_mm = irls(
    x, y, w_func=tukey_biweight_weights, init_beta=beta_huber, c=4.685, scale_from="mad"
)

# ===========================
# 5) Ausgabe (ähnlich R)
# ===========================
def print_fit(name, beta, se, resid):
    b0, b1 = beta
    se0, se1 = se
    res_se = np.sqrt(np.sum(resid**2) / max(len(resid) - 2, 1))
    print(f"\n=== {name} ===")
    print(f"(Intercept)  {b0:10.3f}   SE {se0:8.3f}")
    print(f"x            {b1:10.3f}   SE {se1:8.3f}")
    print(f"Residual SE: {res_se:.3f}")

print_fit("KQ-Methode (OLS)", beta_kq, se_kq, resid_kq)
print_fit("KQ-Methode (-8)", beta_kq_8, np.array([np.nan, np.nan]), y[mask_ohne8] - (np.column_stack([np.ones(mask_ohne8.sum()), x[mask_ohne8]]) @ beta_kq_8))
print_fit("KQ-Methode (-8 & 11)", beta_kq_8_11, np.array([np.nan, np.nan]), y[mask_ohne8_11] - (np.column_stack([np.ones(mask_ohne8_11.sum()), x[mask_ohne8_11]]) @ beta_kq_8_11))
print_fit("Huber-M", beta_huber, se_huber, resid_huber)
print_fit("MM-Methode", beta_mm, se_mm, resid_mm)

# ===========================
# 6) Plot: Daten + 4 Linien
# ===========================
plt.figure(figsize=(8, 5))
plt.scatter(x, y, s=25, alpha=0.8, label="Daten")

# Feste Achsengrenzen für konsistente Darstellung
x_left, x_right = 0.7, 1.30
y_bottom, y_top = 65000, 100000
plt.xlim(x_left, x_right)
plt.ylim(y_bottom, y_top)

# Regressionslinien über den gesamten Plotbereich
xline = np.linspace(x_left, x_right, 500)

plt.plot(xline, beta_kq[0]    + beta_kq[1] * xline,
         color="black", lw=2, label="KQ-Methode")
plt.plot(xline, beta_huber[0] + beta_huber[1] * xline,
         color="black", lw=2, ls="--", label="Huber-M-Methode")
plt.plot(xline, beta_mm[0]    + beta_mm[1] * xline,
         color="black", lw=2, ls=(0, (5, 2, 1, 2)), label="MM-Methode")
plt.plot(xline, beta_kq_8[0]  + beta_kq_8[1] * xline,
         color="cadetblue", lw=2, label="KQ-Methode (-8)")

# -------------------------------------------------------
# Labels „8“ und „11“ (falls vorhanden)
# -------------------------------------------------------

x_offset = 0.01 * (x_right - x_left)

if n >= 8:
    plt.text(x[7] + x_offset, y[7], "8",
             color="dimgrey", fontsize=9, ha="left", va="center")
if n >= 11:
    plt.text(x[10] + x_offset, y[10], "11",
             color="dimgrey", fontsize=9, ha="left", va="center")

# -------------------------------------------------------

plt.xlabel("Wechselkurs in EUR/USD")
plt.ylabel("Fremdwährungserlös in EUR")
plt.legend(loc="lower right", frameon=False)
plt.grid(True, alpha=0.25)
plt.tight_layout()
plt.show()

# ===========================
# 7) Einfluss-/Residuenplots (einfach)
# ===========================
# Vergleich Residuen KQ vs. MM
idx = np.arange(1, n+1)
plt.figure(figsize=(6,5))
plt.scatter(np.abs(resid_kq), np.abs(resid_mm), s=30)
plt.plot([0, max(np.abs(resid_kq).max(), np.abs(resid_mm).max())],
         [0, max(np.abs(resid_kq).max(), np.abs(resid_mm).max())],
         color="gray", alpha=0.6)
plt.xlabel("KQ-Residuen |e|")
plt.ylabel("MM-Residuen |e|")
plt.title("KQ vs. MM – Vergleich der Residuen")
for i in range(n):
    plt.text(np.abs(resid_kq)[i], np.abs(resid_mm)[i], str(i+1), fontsize=8)
plt.tight_layout()
plt.show()

# Gewichte der MM- und Huber-Schätzung
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.plot(idx, w_mm, marker="o"); plt.title("Gewichte (MM)"); plt.xlabel("Beobachtung"); plt.ylabel("Gewicht"); plt.grid(True, alpha=0.2)
plt.subplot(1,2,2)
plt.plot(idx, w_huber, marker="o"); plt.title("Gewichte (Huber)"); plt.xlabel("Beobachtung"); plt.grid(True, alpha=0.2)
plt.tight_layout()
plt.show()

# Vergleich Residuen (KQ / Huber / MM)
plt.figure(figsize=(8,5))
plt.plot(idx, resid_kq,   "o-", label="KQ")
plt.plot(idx, resid_huber,"s-", label="Huber-M")
plt.plot(idx, resid_mm,   "^-", label="MM")
plt.axhline(0, color="gray", ls="--")
plt.xlabel("Beobachtung Nr."); plt.ylabel("Residualwert")
plt.title("Vergleich der Residuen")
plt.legend()
plt.grid(True, alpha=0.2)
plt.tight_layout()
plt.show()
# --------------------------------------------------------------------------------