# -*- coding: utf-8 -*-
"""
Created on Sat Oct  4 09:40:14 2025

@author: Moritz Romeike
"""

# --------------------------------------------------------------------------------
# Programmcode 32 (Python, minimal): Lineare Regression im Währungsrisikomanagement
# Abhängigkeiten: pandas, numpy, matplotlib
# --------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import math

# ===========================
# 1) Daten laden & vorbereiten
# ===========================
csv_path = "data_linear_regression_USD.csv"
df = pd.read_csv(csv_path, sep=';', dtype=str, encoding='utf-8', engine='python')

# Trim & Dezimalkomma -> Punkt
df.columns = df.columns.str.strip()
for c in df.columns:
    df[c] = df[c].astype(str).str.strip()

def to_float(s: pd.Series) -> pd.Series:
    # entferne Tausenderpunkte, ersetze Komma durch Punkt
    return pd.to_numeric(s.str.replace(".", "", regex=False)
                           .str.replace(",", ".", regex=False),
                         errors="coerce")

for col in ("x", "y"):
    if col not in df.columns:
        raise KeyError(f"Spalte '{col}' fehlt. Vorhanden: {list(df.columns)}")
    df[col] = to_float(df[col])

if "X" not in df.columns:
    df["X"] = np.arange(1, len(df) + 1)

# ===========================
# 2) OLS (per Hand) : y = b0 + b1*x
# ===========================
x = df["x"].to_numpy(float)
y = df["y"].to_numpy(float)
n = len(y)

X = np.column_stack([np.ones(n), x])  # Designmatrix
p = X.shape[1]  # 2

# Koeffizienten
XtX = X.T @ X
XtX_inv = np.linalg.inv(XtX)
beta = XtX_inv @ (X.T @ y)  # [b0, b1]
b0, b1 = beta

# Fits & Residuen
yhat = X @ beta
resid = y - yhat

# SSE/SST/SSR, R2, adjR2, sigma
SSE = float(np.sum(resid**2))
SST = float(np.sum((y - y.mean())**2))
SSR = SST - SSE
R2 = SSR / SST if SST > 0 else np.nan
adjR2 = 1 - (SSE/(n - p)) / (SST/(n - 1)) if n > p else np.nan
sigma2 = SSE / (n - p)
sigma = math.sqrt(sigma2)

# Standardfehler, t-Werte (ohne p-Werte)
se_beta = np.sqrt(np.diag(sigma2 * XtX_inv))
tvals = beta / se_beta

# F-Statistik (ohne p-Wert)
F = (SSR/(p-1)) / (SSE/(n-p)) if p > 1 and n > p else np.nan

print("===== OLS (y ~ x) – ohne statsmodels =====")
print(f"b0 (Intercept): {b0:.3f}   SE: {se_beta[0]:.3f}   t: {tvals[0]:.3f}")
print(f"b1 (x)       : {b1:.3f}   SE: {se_beta[1]:.3f}   t: {tvals[1]:.3f}")
print(f"Residual SE: {sigma:.3f}   df: {n-p}")
print(f"R^2: {R2:.4f}   Adj. R^2: {adjR2:.4f}")
print(f"F-Stat: {F:.3f}")

# ===========================
# 3) Plots: Scatter + Linie
# ===========================
plt.figure()
plt.scatter(x, y, alpha=0.7, label="Daten")
xline = np.linspace(x.min(), x.max(), 100)
plt.plot(xline, b0 + b1 * xline, color="blue", label="Regressionsgerade")
plt.xlabel("Wechselkurs in EUR/USD")
plt.ylabel("Fremdwährungserlös in EUR")
plt.title("Streuungsdiagramm mit Regressionsgerade")
plt.legend()
plt.grid(True, alpha=0.2)
plt.tight_layout()
plt.show()

# ===========================
# 4) Residuenplots
# ===========================
# Residuen vs Fitted
plt.figure()
plt.scatter(yhat, resid, s=20, alpha=0.8)
plt.axhline(0, color="red", ls="--")
plt.xlabel("Fitted Values")
plt.ylabel("Residuen")
plt.title("Residuen vs. Fitted")
plt.grid(True, alpha=0.2)
plt.tight_layout()
plt.show()

# Histogramm der Residuen
plt.figure()
plt.hist(resid, bins=15, density=True, edgecolor="white")
plt.xlabel("Residuen")
plt.ylabel("Dichte")
plt.title("Histogramm der Residuen")
plt.tight_layout()
plt.show()

# Einfacher Q-Q-Plot (gegen Normal mit Varianz = sigma^2)
res_sorted = np.sort(resid)
probs = (np.arange(1, n+1) - 0.5) / n

# Approx. Inverse Normal CDF 
import math
def approx_norm_ppf(u):
    a1,a2,a3 = -39.6968302866538, 220.946098424521, -275.928510446969
    a4,a5,a6 = 138.357751867269, -30.6647980661472, 2.50662827745924
    b1,b2,b3 = -54.4760987982241, 161.585836858041, -155.698979859887
    b4,b5    = 66.8013118877197, -13.2806815528857
    c1,c2,c3 = -0.00778489400243029, -0.322396458041136, -2.40075827716184
    c4,c5,c6 = -2.54973253934373, 4.37466414146497, 2.93816398269878
    d1,d2,d3 = 0.00778469570904146, 0.32246712907004, 2.445134137143
    d4       = 3.75440866190742
    p_low = 0.02425
    if u < p_low:
        q = math.sqrt(-2*math.log(u))
        return (((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q+c6) / (((((d1*q+d2)*q+d3)*q+d4)*q)+1)
    if u > 1 - p_low:
        q = math.sqrt(-2*math.log(1-u))
        return -(((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q+c6) / (((((d1*q+d2)*q+d3)*q+d4)*q)+1)
    q = u - 0.5
    r = q*q
    return (((((a1*r+a2)*r+a3)*r+a4)*r+a5)*r+a6)*q / (((((b1*r+b2)*r+b3)*r+b4)*r+b5)*r+1)

theo = np.array([approx_norm_ppf(p) * sigma for p in probs])

plt.figure()
plt.scatter(theo, res_sorted, s=15)
mn, mx = min(theo.min(), res_sorted.min()), max(theo.max(), res_sorted.max())
plt.plot([mn, mx], [mn, mx], 'r--', lw=1)
plt.xlabel("Theoretische Quantile (N(0,σ²))")
plt.ylabel("Beobachtete Residuen")
plt.title("Q–Q–Plot der Residuen (Approx.)")
plt.tight_layout()
plt.show()

# ===========================
# 5) Durbin–Watson
# ===========================
dw = np.sum(np.diff(resid)**2) / np.sum(resid**2) if np.sum(resid**2) > 0 else np.nan
print(f"Durbin–Watson: {dw:.6f}")

# ===========================
# 6) Breusch–Pagan (LM-Stat)
# ===========================
# Auxiliary regression e^2 ~ 1 + x  -> LM = n * R^2
e2 = resid**2
Z = X  # [1, x]
beta_aux = np.linalg.lstsq(Z, e2, rcond=None)[0]
e2_hat = Z @ beta_aux
SSE_aux = np.sum((e2 - e2_hat)**2)
SST_aux = np.sum((e2 - e2.mean())**2)
R2_aux = 1 - SSE_aux/SST_aux if SST_aux > 0 else 0.0
BP_stat = n * R2_aux
print(f"Breusch–Pagan (LM): {BP_stat:.3f}   df = 1   (p-Wert ohne SciPy nicht berechnet)")

# ===========================
# 7) ACF der Residuen (einfach)
# ===========================
def acf_values(e, max_lag=20):
    e = np.asarray(e) - np.mean(e)
    denom = np.sum(e**2)
    acf = [1.0]
    for k in range(1, max_lag+1):
        num = np.sum(e[k:] * e[:-k])
        acf.append(num/denom if denom > 0 else 0.0)
    return np.array(acf)

acf_res = acf_values(resid, max_lag=20)
lags = np.arange(len(acf_res))

plt.figure()
plt.stem(lags, acf_res)   # kompatibel mit alten/neuen Matplotlib
plt.axhline(0, color="black", lw=1)
plt.xlabel("Lag")
plt.ylabel("ACF")
plt.title("Autokorrelation der Residuen")
plt.tight_layout()
plt.show()

# ===========================
# 8) Einflussgrößen: hat, rstandard, rstudent, Cook’s D
# ===========================
H = X @ XtX_inv @ X.T
hat = np.clip(np.diag(H), 0, 1)

rstandard = resid / (sigma * np.sqrt(1 - hat))

rstudent = np.zeros(n)
for i in range(n):
    sigma_i2 = (SSE - resid[i]**2 / (1 - hat[i])) / (n - p - 1)
    sigma_i = math.sqrt(max(sigma_i2, 1e-12))
    rstudent[i] = resid[i] / (sigma_i * math.sqrt(1 - hat[i]))

cooks = (resid**2 / (p * sigma2)) * (hat / (1 - hat)**2)

regression_info = pd.DataFrame({
    "X": df["X"].astype(int, errors="ignore"),
    "x": x, "y": y,
    "residuals": resid,
    "rstandard": rstandard,
    "rstudent": rstudent,
    "hat": hat,
    "cooks": cooks
})
print("\nregression_info (erste Zeile):")
print(regression_info.head(1).to_string(index=False))

# ===========================
# 9) Bereinigung & Refit 
# ===========================
# Schwellen: t_0.975 ~ 1.96 (Normal-Approx), hat < 0.016
tcrit_approx = 1.96
clean_mask = (np.abs(regression_info["rstudent"]) < tcrit_approx) & (regression_info["hat"] < 0.016)
df_clean = regression_info.loc[clean_mask, ["x", "y"]].copy()

Xc = np.column_stack([np.ones(len(df_clean)), df_clean["x"].to_numpy(float)])
betac = np.linalg.lstsq(Xc, df_clean["y"].to_numpy(float), rcond=None)[0]
yc_hat = Xc @ betac
resc = df_clean["y"].to_numpy(float) - yc_hat
SSEc = float(np.sum(resc**2))
sigma2c = SSEc / (len(df_clean) - p)
XtXc_inv = np.linalg.inv(Xc.T @ Xc)
se_betac = np.sqrt(np.diag(sigma2c * XtXc_inv))

print("\nOLS (bereinigte Daten):")
print(f"b0: {betac[0]:.3f}  (SE {se_betac[0]:.3f})")
print(f"b1: {betac[1]:.3f}  (SE {se_betac[1]:.3f})")

# ===========================
# 10) Konfidenz- & Prognosebänder (95%, Normal-Approx.)
# ===========================
def intervals_for(x_new_array, beta, Xmat, sigma2, XtX_inv, zcrit=1.96):
    x_new = np.column_stack([np.ones(len(x_new_array)), x_new_array])
    y_pred = x_new @ beta
    se_fit = np.sqrt(np.sum(x_new @ XtX_inv * x_new, axis=1) * sigma2)
    ci = np.column_stack([y_pred - zcrit*se_fit, y_pred + zcrit*se_fit])
    se_pred = np.sqrt(se_fit**2 + sigma2)
    pi = np.column_stack([y_pred - zcrit*se_pred, y_pred + zcrit*se_pred])
    return y_pred, ci, pi

key_figures = np.arange(0.8021, 1.0619 + 1e-9, 0.05)
yp, ci, pi = intervals_for(key_figures, beta, X, sigma2, XtX_inv, zcrit=1.96)

plt.figure()
plt.scatter(x, y, s=18, alpha=0.8)
plt.plot(xline, b0 + b1*xline, color="blue", lw=1.5, label="Regressionsgerade")
plt.plot(key_figures, ci[:,0], "b--", lw=1.5, label="95% KI")
plt.plot(key_figures, ci[:,1], "b--", lw=1.5)
plt.plot(key_figures, pi[:,0], "k--", lw=1.2, label="95% PI")
plt.plot(key_figures, pi[:,1], "k--", lw=1.2)
plt.xlabel("Wechselkurs in EUR/USD")
plt.ylabel("Umsatz aus Fremdwährungsgeschäften in EUR")
plt.legend(loc="upper left")
plt.tight_layout()
plt.show()

# Punktvorhersage bei x=0.8696
value = np.array([0.8696], dtype=float)
y_point, ci_v, pi_v = intervals_for(value, beta, X, sigma2, XtX_inv, zcrit=1.96)
print("\nVorhersage bei x=0.8696:")
print(f"fit: {y_point[0]:.2f}")
print(f"95%-KI: [{ci_v[0,0]:.2f}, {ci_v[0,1]:.2f}]")
print(f"95%-PI: [{pi_v[0,0]:.2f}, {pi_v[0,1]:.2f}]")

# ===========================
# 11) Regressionsvergleich ohne Beob. 51 & 249
# ===========================
def fit_drop(idx_1based):
    i = int(idx_1based) - 1
    if 0 <= i < n:
        mask = np.ones(n, dtype=bool); mask[i] = False
        X_ = X[mask]; y_ = y[mask]
        return np.linalg.lstsq(X_, y_, rcond=None)[0]
    return beta

beta_51  = fit_drop(51)
beta_249 = fit_drop(249)

plt.figure()
plt.scatter(x, y, s=18, alpha=0.8)
plt.plot(xline, b0 + b1*xline, color="blue",  lw=2, label="gesamt")
plt.plot(xline, beta_51[0]  + beta_51[1]*xline,  color="black", lw=2, label="ohne 51")
plt.plot(xline, beta_249[0] + beta_249[1]*xline, color="green", lw=2, label="ohne 249")
plt.axvline(float(np.mean(x)), ls="--", color="grey", label="Mittelwert x")
if 0 <= 50 < n: plt.text(x[50],  y[50],  "51",  color="dimgrey")
if 0 <= 248 < n: plt.text(x[248], y[248], "249", color="dimgrey")
plt.xlabel("Wechselkurs in EUR/USD"); plt.ylabel("Umsatz in EUR")
plt.legend(loc="upper left"); plt.tight_layout(); plt.show()
# --------------------------------------------------------------------------------