# -*- coding: utf-8 -*-
"""
Created on Fri Oct  3 12:50:14 2025

@author: Moritz Romeike
"""

# ------------------------------------------------------------------------
# Programmcode 21 (Python): Scatterplot mit Pearson-, partieller & multipler Korrelation
# ------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Optional: p-Wert für Pearson, falls SciPy verfügbar
try:
    from scipy import stats
    HAS_SCIPY = True
except Exception:
    HAS_SCIPY = False

# === Daten einlesen ===
base = Path(__file__).resolve().parent
df = pd.read_excel(base / "Kap_4.8.2_dioden_preis_lebensdauer.xlsx")

# Spaltennamen wie in R (mit Sonderzeichen)
col_price = "Preis (€)"
col_life  = "Lebensdauer (Stunden)"
col_temp  = "Temperaturbelastung (°C)"

# Nur die benötigten Spalten, fehlende entfernen
df = df[[col_price, col_life, col_temp]].copy()
df = df.dropna()

# In numpy-Arrays
x = pd.to_numeric(df[col_price], errors="coerce").to_numpy()
y = pd.to_numeric(df[col_life],  errors="coerce").to_numpy()
z = pd.to_numeric(df[col_temp],  errors="coerce").to_numpy()

mask = ~np.isnan(x) & ~np.isnan(y) & ~np.isnan(z)
x, y, z = x[mask], y[mask], z[mask]
n = len(x)

# === Pearson-Korrelation (r, p) ===
if HAS_SCIPY:
    r_pearson, p_pearson = stats.pearsonr(x, y)
else:
    r_pearson = float(np.corrcoef(x, y)[0, 1])
    p_pearson = None  # ohne SciPy kein p-Wert aus der t-Verteilung

# === Partielle Korrelation r_{xy·z} via Residuenmethode ===
# Schritt 1: Residuen von X~Z
Z = np.column_stack([np.ones(n), z])               # Designmatrix mit Intercept
beta_x, *_ = np.linalg.lstsq(Z, x, rcond=None)
x_hat = Z @ beta_x
res_x = x - x_hat

# Schritt 2: Residuen von Y~Z
beta_y, *_ = np.linalg.lstsq(Z, y, rcond=None)
y_hat = Z @ beta_y
res_y = y - y_hat

# Schritt 3: Pearson r der Residuen
r_partial = float(np.corrcoef(res_x, res_y)[0, 1])

# === Multiple Korrelation (R) aus Regression Y ~ X + Z ===
XZ = np.column_stack([np.ones(n), x, z])
beta_m, *_ = np.linalg.lstsq(XZ, y, rcond=None)
y_pred = XZ @ beta_m
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r_squared = 1.0 - ss_res / ss_tot if ss_tot > 0 else np.nan
r_multiple = float(np.sqrt(max(r_squared, 0.0)))

# === Plot: Scatter + Regressionslinie (OLS) ===
plt.figure(figsize=(7,5))
plt.scatter(x, y, alpha=0.7, label="Daten", s=40)

# Einfache Regressionslinie Y ~ X (ohne Z) für die Visualisierung
a, b = np.polyfit(x, y, deg=1)  # y = a*x + b
xs = np.linspace(x.min(), x.max(), 200)
plt.plot(xs, a*xs + b, linewidth=2, label="OLS-Linie")

plt.title("Scatterplot: Preis vs. Lebensdauer der Dioden")
plt.xlabel(col_price)
plt.ylabel(col_life)
plt.legend()

# Annotation unten rechts
ann_lines = [
    f"Pearson: {r_pearson:.3f}",
    f"Partielle Korr.: {r_partial:.3f}",
    f"Multiple Korr.: {r_multiple:.3f}"
]
if p_pearson is not None:
    ann_lines.append(f"p-Wert (Pearson): {p_pearson:.5f}")
ann_text = "\n".join(ann_lines)

x_pos = x.max()
y_pos = y.min()
plt.annotate(ann_text, xy=(x_pos, y_pos), xycoords="data",
             xytext=(-10, 10), textcoords="offset points",
             ha="right", va="bottom")

plt.tight_layout()
plt.show()

# === Konsolen-Ausgabe ===
print(f"Pearson-Korrelation: {r_pearson:.3f}")
print(f"p-Wert (Pearson): {p_pearson:.5f}" if p_pearson is not None else "p-Wert (Pearson): (SciPy nicht installiert)")
print(f"Partielle Korrelation: {r_partial:.3f}")
print(f"Multiple Korrelation:  {r_multiple:.3f}")
# ------------------------------------------------------------------------
