# -*- coding: utf-8 -*-
"""
Created on Sat Oct  4 09:18:38 2025

@author: Moritz Romeike
"""

# --------------------------------------------------------------------------------
# Code listing 29 (Python): K-means for fraud detection in the purchasing process
# --------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# ========== Adjust path (read.csv2 style: sep=';') ===============================
csv_path = Path(r"daten_kmeans_clustering.csv")

# ========== Load data ============================================================
df = pd.read_csv(csv_path, sep=';', dtype=str, encoding='utf-8', engine='python')

# Trim column names and cell values (numeric conversion follows below)
df.columns = df.columns.str.strip()
for c in df.columns:
    df[c] = df[c].astype(str).str.strip()

# Expected columns, as in the R example
num_cols = ["LieferantenID", "Zeit", "Rechnungsbetrag"]
missing = [c for c in num_cols if c not in df.columns]
if missing:
    raise KeyError(f"Missing columns {missing}. Available: {list(df.columns)}")

# Convert decimal commas tolerantly ('.' is treated as a thousands separator)
def to_float_series(s: pd.Series) -> pd.Series:
    cleaned = s.str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
    return pd.to_numeric(cleaned, errors="coerce")

for c in num_cols:
    df[c] = to_float_series(df[c])
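
# Note: a sketch of an equivalent one-step load, assuming the file consistently
# uses ',' as decimal separator and '.' as thousands separator (pandas supports
# this natively via the decimal/thousands parameters):
#
#   df_alt = pd.read_csv(csv_path, sep=';', decimal=',', thousands='.', encoding='utf-8')
#
# The manual route above is kept because it also tolerates stray whitespace.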

data = df[num_cols].copy()

# ========== Descriptive statistics ===============================================
print("\n--- Descriptive statistics ---")
print(data.describe().T.round(6))
print("\nStandard deviations:")
for c in num_cols:
    print(f"{c}: {data[c].std(ddof=1):.4f}")

# ========== Correlations & scatter matrix ========================================
corr = data.corr(method="pearson")
print("\nCorrelation matrix (Pearson):")
print(corr.round(4))

# Simple scatter matrix
pd.plotting.scatter_matrix(data, figsize=(7, 7), diagonal='hist')
plt.suptitle("Scatter matrix & histograms", y=0.93)
plt.tight_layout()
plt.show()

# Heatmap of the correlations
plt.figure(figsize=(5,4))
plt.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar(label="Correlation coeff.")
plt.xticks(range(len(num_cols)), num_cols, rotation=45, ha='right')
plt.yticks(range(len(num_cols)), num_cols)
plt.title("Correlation matrix")
plt.tight_layout()
plt.show()

# ========== Standardization (z-score) ============================================
X = data.to_numpy(dtype=float)
mu = np.nanmean(X, axis=0)
sigma = np.nanstd(X, axis=0, ddof=1)  # sample std, as in R's scale()
sigma[sigma == 0] = 1.0               # guard against constant columns
Xz = (X - mu) / sigma

print("\nErste Zeilen des z-standardisierten Datensatzes:")
print(pd.DataFrame(Xz, columns=num_cols).head())
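
# For reference, a sketch of the same step via scikit-learn (assuming it is
# installed). Note that StandardScaler normalizes with the population std
# (ddof=0), so its z-scores differ from ours by the factor sqrt(n/(n-1)):
#
#   from sklearn.preprocessing import StandardScaler
#   Xz_sk = StandardScaler().fit_transform(X)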

# ========== Distance matrix & visualization (like fviz_dist) =====================
def euclidean_distance_matrix(Z: np.ndarray) -> np.ndarray:
    # squared form: ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    G = Z @ Z.T
    sq = np.diag(G)
    D2 = sq[:, None] + sq[None, :] - 2*G
    D2[D2 < 0] = 0.0  # clamp tiny negatives from floating-point round-off
    return np.sqrt(D2)

D = euclidean_distance_matrix(Xz)
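
# Optional cross-check sketch: SciPy computes the same matrix (assuming scipy
# is available):
#
#   from scipy.spatial.distance import pdist, squareform
#   assert np.allclose(D, squareform(pdist(Xz)))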

plt.figure(figsize=(5,4))
plt.imshow(D, cmap="RdYlBu_r")
plt.colorbar(label="Euclid. Distanz")
plt.title("Distanzmatrix (z-standardisiert)")
plt.tight_layout()
plt.show()

# ========== K-Means (k-means++) =================================================
def kmeans_pp_init(Z: np.ndarray, k: int, rng: np.random.Generator) -> np.ndarray:
    """k-means++ seeding: first center uniform at random, then D^2-weighted picks."""
    n = Z.shape[0]
    centers = [int(rng.integers(0, n))]
    for _ in range(1, k):
        # squared distance of every point to its nearest existing center
        d2 = ((Z - Z[centers[0]])**2).sum(axis=1)
        for c in centers[1:]:
            d2 = np.minimum(d2, ((Z - Z[c])**2).sum(axis=1))
        probs = d2 / d2.sum() if d2.sum() > 0 else np.ones(n)/n
        centers.append(int(rng.choice(n, p=probs)))
    return np.array(centers, dtype=int)

def kmeans_fit(Z: np.ndarray, k: int, n_init: int = 25, max_iter: int = 300, tol: float = 1e-6,
               rng_seed: int = 123):
    rng = np.random.default_rng(rng_seed)
    best_inertia = None
    best = None

    for _ in range(n_init):
        # k-means++ init
        idx = kmeans_pp_init(Z, k, rng)
        C = Z[idx].copy()
        for _ in range(max_iter):
            # assignment step: each point joins its nearest center
            dists = np.linalg.norm(Z[:, None, :] - C[None, :, :], axis=2)  # (n, k)
            labels = np.argmin(dists, axis=1)
            # update step: recompute centers (keep the old one if a cluster empties)
            C_new = np.vstack([Z[labels==j].mean(axis=0) if np.any(labels==j) else C[j]
                               for j in range(k)])
            # converged when the centers barely move
            shift = np.linalg.norm(C_new - C)
            C = C_new
            if shift < tol:
                break

        # final assignment against the updated centers, then inertia (sum of squared distances)
        labels = np.argmin(np.linalg.norm(Z[:, None, :] - C[None, :, :], axis=2), axis=1)
        inertia = np.sum((Z - C[labels])**2)
        if (best_inertia is None) or (inertia < best_inertia):
            best_inertia = inertia
            best = {"labels": labels, "centers": C, "inertia": inertia}

    # additional summary statistics
    withinss = np.array([np.sum((Z[best["labels"]==j] - best["centers"][j])**2)
                         for j in range(k)], dtype=float)
    size = np.array([(best["labels"]==j).sum() for j in range(k)], dtype=int)
    totss = np.sum((Z - Z.mean(axis=0))**2)
    betweenss = totss - withinss.sum()

    return {
        "cluster": best["labels"] + 1,  # 1..k (R-Stil)
        "centers": best["centers"],
        "totss": float(totss),
        "withinss": withinss,
        "tot_withinss": float(withinss.sum()),
        "betweenss": float(betweenss),
        "size": size,
        "inertia": float(best["inertia"])
    }
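
# Optional cross-check sketch: scikit-learn's KMeans uses the same recipe
# (k-means++ seeding, Lloyd iterations), so its inertia should closely match
# tot_withinss above (assuming scikit-learn is installed):
#
#   from sklearn.cluster import KMeans
#   km_sk = KMeans(n_clusters=3, init='k-means++', n_init=25, random_state=123).fit(Xz)
#   print(km_sk.inertia_)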

# ========== Choosing K: elbow (WSS) & silhouette =================================
def silhouette_values(D: np.ndarray, labels0: np.ndarray) -> np.ndarray:
    """Silhouette values from a distance matrix; labels0: 0..k-1."""
    n = D.shape[0]
    s = np.zeros(n, dtype=float)
    k = labels0.max() + 1
    idx_by_c = [np.where(labels0==c)[0] for c in range(k)]
    for i in range(n):
        ci = labels0[i]
        own = idx_by_c[ci]
        if own.size <= 1:
            s[i] = 0.0  # convention: singleton clusters get silhouette 0
            continue
        a = np.mean(D[i, own[own != i]])
        b = np.inf
        for c in range(k):
            if c == ci or idx_by_c[c].size == 0:
                continue
            b = min(b, np.mean(D[i, idx_by_c[c]]))
        if b == np.inf:
            s[i] = 0.0
        else:
            denom = max(a, b)
            s[i] = (b - a) / denom if denom > 0 else 0.0
    return s
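
# Optional cross-check sketch: the same values via scikit-learn (assuming it is
# installed), fed with the precomputed distance matrix and a 0-based label vector:
#
#   from sklearn.metrics import silhouette_samples
#   s_sk = silhouette_samples(D, labels0, metric='precomputed')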

# Silhouette & Elbow-Kurven
Ks = range(2, 9)
avg_sil = []
wss = []

for k in Ks:
    km = kmeans_fit(Xz, k=k, n_init=25, max_iter=300, tol=1e-6, rng_seed=123)
    wss.append(km["tot_withinss"])
    labs0 = (km["cluster"] - 1)  # 0..k-1
    svals = silhouette_values(D, labs0)
    avg_sil.append(np.mean(svals))

plt.figure(figsize=(6,4))
plt.plot(list(Ks), avg_sil, marker='o')
plt.xlabel("Anzahl der Cluster (K)")
plt.ylabel("Mittlerer Silhouette-Koeffizient")
plt.title("Silhouette-Methode zur Wahl von K")
plt.grid(True, linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

plt.figure(figsize=(6,4))
plt.plot(list(Ks), wss, marker='o')
plt.xlabel("Anzahl der Cluster (K)")
plt.ylabel("Totale Within-SS (WSS)")
plt.title("Elbow-Methode (WSS)")
plt.grid(True, linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

best_k_sil = list(Ks)[int(np.argmax(avg_sil))]
print(f"\nBeste K (Silhouette): {best_k_sil} mit Ø Silhouette = {max(avg_sil):.3f}")

# (Optional) The gap statistic would also be an option but is more involved;
# for a lean script, silhouette & elbow are usually sufficient in practice.

# ========== K-means for K=3 (analogous to the R example) =========================
km3 = kmeans_fit(Xz, k=3, n_init=25, max_iter=300, tol=1e-6, rng_seed=123)
print("\n--- K-means mit K=3 ---")
print(f"Clustergrößen: {km3['size']}")
print(f"WithinSS je Cluster: {km3['withinss'].round(4)}")
print(f"(between_SS / total_SS) = {km3['betweenss']/km3['totss']*100:.1f} %")
print("Clusterzentren (z-standardisiert):")
print(pd.DataFrame(km3["centers"], columns=num_cols).round(4))

# Print silhouette for K=3 & plot
s3 = silhouette_values(D, km3["cluster"] - 1)
print(f"Mean silhouette (K=3): {np.mean(s3):.3f}")

def silhouette_barplot(vals: np.ndarray, labels1: np.ndarray, title: str):
    order = np.argsort(labels1)
    v = vals[order]
    l = labels1[order]
    plt.figure(figsize=(6, 5))
    y0 = 0
    yt, ylab = [], []
    for c in np.unique(l):
        seg = np.sort(v[l == c])
        y = np.arange(y0, y0 + len(seg))
        plt.barh(y, seg, height=1.0)
        yt.append(y0 + len(seg) / 2)
        ylab.append(f"Cluster {c}")
        y0 += len(seg) + 2  # gap between cluster blocks
    plt.axvline(np.mean(vals), color='red', linestyle='--', linewidth=1,
                label=f"mean = {np.mean(vals):.3f}")
    plt.yticks(yt, ylab)
    plt.xlabel("Silhouette")
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()

silhouette_barplot(s3, km3["cluster"], "Silhouette-Plot (K=3)")

# ========== Visualizing the clusters (PCA 2D) ====================================
def pca_2d(Z: np.ndarray):
    # center, then project onto the first two principal components via SVD
    Zc = Z - Z.mean(axis=0, keepdims=True)
    U, S, Vt = np.linalg.svd(Zc, full_matrices=False)
    PC = U[:, :2] * S[:2]
    return PC

PC = pca_2d(Xz)
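
# Equivalent via scikit-learn, a sketch assuming it is installed (component
# signs may be flipped, which does not change the cluster picture):
#
#   from sklearn.decomposition import PCA
#   PC_sk = PCA(n_components=2).fit_transform(Xz)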

def scatter_clusters(PC: np.ndarray, labels1: np.ndarray, title: str):
    plt.figure(figsize=(6, 5))
    for c in np.unique(labels1):
        mask = labels1 == c
        plt.scatter(PC[mask, 0], PC[mask, 1], s=50, label=f"Cluster {c}", alpha=0.8)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title(title)
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.show()

scatter_clusters(PC, km3["cluster"], "K-means clusters (K=3), PCA projection")

# ========== Comparison plots K=2,3,4,5 (one plot each) ===========================
for k in [2,3,4,5]:
    kmk = kmeans_fit(Xz, k=k, n_init=25, max_iter=300, tol=1e-6, rng_seed=123)
    scatter_clusters(PC, kmk["cluster"], f"K-means clusters (K={k}), PCA projection")

# ========== Boxplots as in the R code (for K=4) ==================================
km4 = kmeans_fit(Xz, k=4, n_init=25, max_iter=300, tol=1e-6, rng_seed=123)
df_box = data.copy()
df_box["cluster"] = km4["cluster"]

plt.figure(); df_box.boxplot(column="LieferantenID", by="cluster", grid=False)
plt.title("LieferantenID ~ cluster"); plt.suptitle(""); plt.xlabel("cluster"); plt.ylabel("LieferantenID"); plt.tight_layout(); plt.show()

plt.figure(); df_box.boxplot(column="Rechnungsbetrag", by="cluster", grid=False)
plt.title("Rechnungsbetrag ~ cluster"); plt.suptitle(""); plt.xlabel("cluster"); plt.ylabel("Rechnungsbetrag"); plt.tight_layout(); plt.show()

plt.figure(); df_box.boxplot(column="Zeit", by="cluster", grid=False)
plt.title("Zeit ~ cluster"); plt.suptitle(""); plt.xlabel("cluster"); plt.ylabel("Zeit"); plt.tight_layout(); plt.show()
# --------------------------------------------------------------------------------
