# -*- coding: utf-8 -*-
"""
Created on Sat Oct  4 10:19:14 2025

@author: Moritz Romeike
"""

# --------------------------------------------------------------------
# Programmcode 40 (Python): SVM mit radialem Kernel – Kundensegmentierung
# Repliziert die ggplot-Grafik aus dem R-Beispiel
# --------------------------------------------------------------------

import sys, subprocess, importlib
def ensure(mod, pip_name=None):
    """Import and return module *mod*, installing it with pip if it is missing.

    Parameters
    ----------
    mod : str
        Importable module name (e.g. ``"sklearn"``).
    pip_name : str, optional
        Name passed to ``pip install`` when it differs from the import name
        (e.g. ``"scikit-learn"``). Defaults to *mod*.

    Returns
    -------
    module
        The imported module object.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    ModuleNotFoundError
        If the module still cannot be imported after the installation.
    """
    try:
        return importlib.import_module(mod)
    except ModuleNotFoundError:
        # Use the running interpreter's pip so the package is installed into
        # the same environment this script executes in.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", pip_name or mod]
        )
        # Modules created after interpreter start-up may be hidden by the
        # import system's finder caches; refresh them before retrying.
        importlib.invalidate_caches()
        return importlib.import_module(mod)

# Resolve the third-party stack up front, in the order listed. Each entry is
# (import name[, pip name]); ensure() installs whatever is missing.
np, pd, mpl, skl = [
    ensure(*spec)
    for spec in (
        ("numpy",),
        ("pandas",),
        ("matplotlib",),
        ("sklearn", "scikit-learn"),
    )
]

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# ------------------------------ Data ----------------------------------
# Synthetic customer data: three equally sized segments, each feature drawn
# from a segment-specific normal distribution (seeded for reproducibility).
rng = np.random.default_rng(42)
n = 150
seg_order = ["preisbewusst", "qualitätsorientiert", "gewerblich"]
Segment = np.repeat(seg_order, n // 3)


def _draw_by_segment(*loc_scale):
    """Draw n//3 normal samples per (mean, std) pair and stack them in order."""
    return np.concatenate(
        [rng.normal(loc, scale, n // 3) for loc, scale in loc_scale]
    )


# The dict entries are evaluated top to bottom, so the random stream is
# consumed feature by feature and, within a feature, segment by segment.
kunden = pd.DataFrame({
    "Segment": Segment,
    "Bestellvolumen": _draw_by_segment((80, 10), (60, 10), (100, 15)),
    "Preis_pro_Einheit": _draw_by_segment((2.0, 0.2), (3.0, 0.3), (2.5, 0.2)),
    "Bestellhaeufigkeit": _draw_by_segment((5, 1), (8, 1), (3, 1)),
    "Ruecksendequote": _draw_by_segment((0.05, 0.01), (0.02, 0.005), (0.08, 0.015)),
    "Kundenzufriedenheit": _draw_by_segment((7, 1), (9, 1), (6, 1)),
})

# Store the label as an ordered categorical following seg_order.
kunden["Segment"] = kunden["Segment"].astype(
    pd.CategoricalDtype(categories=seg_order, ordered=True)
)

features = [
    "Bestellvolumen",
    "Preis_pro_Einheit",
    "Bestellhaeufigkeit",
    "Ruecksendequote",
    "Kundenzufriedenheit",
]
# Feature matrix (float) and label vector (plain strings) for scikit-learn.
X = kunden.loc[:, features].to_numpy(dtype=float)
y = kunden["Segment"].astype(str).to_numpy()

# Stratified 70/30 train/test split with a fixed seed
# (counterpart of caret::createDataPartition in the R example).
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=123,
)

# --------------------------- SVM --------------------------------
# R reference settings: scale=TRUE, kernel=radial, cost=1, gamma=1/p
p = X_tr.shape[-1]  # number of features
# Standardise inside the pipeline, then fit the RBF-kernel classifier.
# gamma="auto" makes scikit-learn use 1 / n_features, i.e. 1/p.
svm_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "svc",
            SVC(
                kernel="rbf",
                C=1.0,
                gamma="auto",
                probability=True,
                random_state=123,
            ),
        ),
    ]
).fit(X_tr, y_tr)

# ------------------------ Confusion matrix (optional) --------------------
# Rows are the true segments, columns the predicted ones, both in seg_order.
y_pred = svm_pipe.predict(X_te)
cm = confusion_matrix(y_te, y_pred, labels=seg_order)
cm_table = pd.DataFrame(data=cm, index=seg_order, columns=seg_order)
print(cm_table)

# -------------------- Grid for the decision surface ----------------------
# The surface is drawn over two features; look up their column positions.
ix_x, ix_y = (
    features.index(col) for col in ("Bestellvolumen", "Preis_pro_Einheit")
)

# 300 x 300 lattice spanning the training range plus a margin on each side.
x_lo, x_hi = X_tr[:, ix_x].min() - 10, X_tr[:, ix_x].max() + 10
y_lo, y_hi = X_tr[:, ix_y].min() - 0.5, X_tr[:, ix_y].max() + 0.5
xrange = np.linspace(x_lo, x_hi, num=300)
yrange = np.linspace(y_lo, y_hi, num=300)
XX, YY = np.meshgrid(xrange, yrange)

# Hold every other feature at its training mean (original units; the
# pipeline applies the scaling itself), then overwrite the two plotted ones.
train_means = X_tr.mean(axis=0)
grid = np.repeat(train_means[np.newaxis, :], XX.size, axis=0)
grid[:, ix_x] = XX.reshape(-1)
grid[:, ix_y] = YY.reshape(-1)

# Predicted segment label for every lattice point, shaped like the mesh.
Z = svm_pipe.predict(grid).reshape(XX.shape)

# ------------------------------ Colours ----------------------------------
# One fill colour per segment; the training points reuse the same palette.
fill_colors = dict(
    zip(
        ("preisbewusst", "gewerblich", "qualitätsorientiert"),
        ("#66c2a5", "#fc8d62", "#8da0cb"),
    )
)
point_colors = dict(fill_colors)
# Colormap entries follow seg_order so that entry i belongs to class code i.
cmap_fill = ListedColormap([fill_colors[name] for name in seg_order])

# Translate the predicted labels into integer codes (position in seg_order).
z_categorical = pd.Categorical(Z.reshape(-1), categories=seg_order, ordered=True)
Z_codes = z_categorical.codes.reshape(Z.shape)

# -------------------------------- Plot -----------------------------------
plt.figure(figsize=(12.5, 6.8))

# Filled regions (prediction). Z_codes contains integer class codes, so the
# band edges are placed at half-integers: every code then sits in the middle
# of its own band. Passing a plain integer for `levels` would let matplotlib
# pick "nice" levels from the data range, and the colour scaling would follow
# those levels; if the grid contained only some of the classes, the regions
# would be drawn in colours that disagree with the legend. Fixing vmin/vmax
# to the outer edges keeps code i mapped to colormap entry i in all cases.
class_edges = np.arange(len(seg_order) + 1) - 0.5
plt.contourf(XX, YY, Z_codes, levels=class_edges, cmap=cmap_fill,
             vmin=class_edges[0], vmax=class_edges[-1], alpha=0.30)

# Training points, coloured by their true segment
for cls in seg_order:
    mask = (y_tr == cls)
    plt.scatter(X_tr[mask, ix_x], X_tr[mask, ix_y],
                s=32, color=point_colors[cls], edgecolor="white", linewidths=0.6)

plt.title("SVM mit radialem Kernel – Inntal AG", fontsize=16)
plt.xlabel("Bestellvolumen")
plt.ylabel("Preis pro Einheit (EUR)")
plt.grid(True, alpha=0.15)

# Legends (segment / prediction), built from proxy artists
seg_handles = [Line2D([0], [0], marker='o', linestyle='', markersize=8,
                      markerfacecolor=point_colors[c], markeredgecolor="white", label=c)
               for c in seg_order]
leg1 = plt.legend(handles=seg_handles, title="Segment",
                  loc="upper right", bbox_to_anchor=(1.30, 1.00), frameon=True)

fill_handles = [Patch(facecolor=fill_colors[c], edgecolor='none', alpha=0.30, label=c)
                for c in seg_order]
leg2 = plt.legend(handles=fill_handles, title="Vorhersage",
                  loc="upper right", bbox_to_anchor=(1.30, 0.72), frameon=True)

# The second plt.legend() call replaces the first legend on the axes, so the
# first one is added back explicitly to show both.
plt.gca().add_artist(leg1)
plt.tight_layout()
plt.show()
# --------------------------------------------------------------------