initial commit (forked from private repo)

2025-04-11 11:08:28 +02:00
commit 3bdd37f46c
154 changed files with 45901 additions and 0 deletions
--- a/extracted_cells/cell0.py
+++ b/extracted_cells/cell0.py
@ -0,0 +1 @@
+%pip install scikit-learn matplotlib seaborn pandas numpy lime
--- a/extracted_cells/cell1.py
+++ b/extracted_cells/cell1.py
@ -0,0 +1,15 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
+from sklearn.preprocessing import LabelEncoder
+from sklearn.tree import plot_tree
+
+# Load the data
+df = pd.read_csv("./sample_data/adult_census_income/adult.csv")
+
+# Show the first rows of the dataset
+df.head(10)
--- a/extracted_cells/cell10.py
+++ b/extracted_cells/cell10.py
@ -0,0 +1,11 @@
+# Define features and target values
+X = df_encoded.drop(['income', 'income_encoded'], axis=1)
+y = df_encoded['income_encoded']
+
+
+# Split the data into training and test sets (test_size=0.2, stratified on y)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
+
+print(f"Trainingsdaten: {X_train.shape[0]} Beispiele")
+print(f"Testdaten: {X_test.shape[0]} Beispiele")
+print(f"Features: {X_train.shape[1]} Merkmale")
--- a/extracted_cells/cell11.py
+++ b/extracted_cells/cell11.py
@ -0,0 +1,17 @@
+# Train the random forest model
+rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
+rf_model.fit(X_train, y_train)
+
+# Predictions for the test data
+y_pred = rf_model.predict(X_test)
+
+# Evaluate model performance
+accuracy = accuracy_score(y_test, y_pred)
+precision = precision_score(y_test, y_pred)
+recall = recall_score(y_test, y_pred)
+f1 = f1_score(y_test, y_pred)
+
+print(f"Accuracy: {accuracy:.4f}")
+print(f"Precision: {precision:.4f}")
+print(f"Recall: {recall:.4f}")
+print(f"F1 Score: {f1:.4f}")
--- a/extracted_cells/cell12.py
+++ b/extracted_cells/cell12.py
@ -0,0 +1,16 @@
+
+# Confusion matrix of the test predictions, drawn as a heatmap and saved under output/
+cm = confusion_matrix(y_test, y_pred)
+plt.figure(figsize=(8, 6))
+sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
+            xticklabels=['<=50K', '>50K'],
+            yticklabels=['<=50K', '>50K'])
+plt.xlabel('Vorhergesagt')
+plt.ylabel('Tatsächlich')
+plt.title('Confusion Matrix')
+plt.savefig('output/Confusions_Matrix.png', dpi=300)
+plt.show()
+
+# Classification report
+print("\nKlassifikationsbericht:")
+print(classification_report(y_test, y_pred, target_names=['<=50K', '>50K']))
--- a/extracted_cells/cell13.py
+++ b/extracted_cells/cell13.py
@ -0,0 +1,12 @@
+# Visualisierung eines einzelnen Entscheidungsbaums aus dem Random Forest
+plt.figure(figsize=(25, 12))
+tree_to_plot = rf_model.estimators_[0]  # Ersten Baum aus dem Forest auswählen
+plot_tree(tree_to_plot, 
+          feature_names=X_train.columns,
+          class_names=['<=50K', '>50K'],
+          filled=True,
+          rounded=True,
+          fontsize=10,
+          max_depth=3)
+plt.savefig('output/Random_Forest_Tree_Example.png', dpi=300)
+plt.show()
--- a/extracted_cells/cell14.py
+++ b/extracted_cells/cell14.py
@ -0,0 +1,12 @@
+# Feature importance: rank the columns by rf_model.feature_importances_ and plot the top 15
+feature_importance = pd.DataFrame({
+    'Feature': X_train.columns,
+    'Importance': rf_model.feature_importances_
+}).sort_values('Importance', ascending=False)
+
+plt.figure(figsize=(12, 8))
+sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
+plt.title('Feature Importance')
+plt.tight_layout()
+plt.savefig('output/Feature_Importance.png', dpi=300)
+plt.show()
--- a/extracted_cells/cell15.py
+++ b/extracted_cells/cell15.py
@ -0,0 +1,33 @@
+# Define the hyperparameter grid
+param_grid = {
+    'n_estimators': [50, 100],
+    'max_depth': [None, 10, 20],
+    'min_samples_split': [2, 5],
+    'min_samples_leaf': [1, 2]
+}
+
+# Grid search with 3-fold cross-validation, scored by accuracy
+grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
+grid_search.fit(X_train, y_train)
+
+# Best parameters
+print("Beste Parameter:")
+print(grid_search.best_params_)
+
+# Best model
+best_rf_model = grid_search.best_estimator_
+
+# Predictions with the best model
+y_pred_best = best_rf_model.predict(X_test)
+
+# Evaluate model performance
+accuracy_best = accuracy_score(y_test, y_pred_best)
+precision_best = precision_score(y_test, y_pred_best)
+recall_best = recall_score(y_test, y_pred_best)
+f1_best = f1_score(y_test, y_pred_best)
+
+print(f"\nBestes Modell:")
+print(f"Accuracy: {accuracy_best:.4f}")
+print(f"Precision: {precision_best:.4f}")
+print(f"Recall: {recall_best:.4f}")
+print(f"F1 Score: {f1_best:.4f}")
--- a/extracted_cells/cell16.py
+++ b/extracted_cells/cell16.py
@ -0,0 +1,17 @@
+# LIME für Erklärbarkeit
+from lime import lime_tabular
+import random
+
+# Erstelle einen LIME-Erklärer
+lime_explainer = lime_tabular.LimeTabularExplainer(
+    X_train.values,
+    feature_names=X_train.columns,
+    class_names=['<=50K', '>50K'],
+    mode='classification',
+    random_state=42
+)
+
+# Wähle ein zufälliges Beispiel aus den Testdaten
+random_idx = random.randint(0, len(X_test) - 1)
+instance_df = X_test.iloc[random_idx:random_idx+1]
+instance = instance_df.values[0]  # Für LIME benötigen wir das Array
--- a/extracted_cells/cell17.py
+++ b/extracted_cells/cell17.py
@ -0,0 +1,65 @@
+# Erkläre die Vorhersage mit LIME
+
+def analyze_lime_feature_importance(instance, rf_model, lime_explainer, num_features=5):
+    """
+    Analysiert die Feature-Wichtigkeiten der LIME-Erklärung.
+    """
+    # LIME-Erklärung generieren
+    exp = lime_explainer.explain_instance(
+        instance,
+        rf_model.predict_proba,
+        num_features=num_features
+    )
+    
+    # Random Forest Vorhersage (nur zur Information)
+    feature_names = rf_model.feature_names_in_
+    instance_df = pd.DataFrame([instance], columns=feature_names)
+    rf_prediction = rf_model.predict_proba(instance_df)[0, 1]
+    
+    # Feature-Wichtigkeiten aus LIME extrahieren
+    feature_importance = exp.as_list()
+    
+    # Visualisierung der Feature-Wichtigkeiten
+    plt.figure(figsize=(10, 6))
+    
+    # Features und ihre Wichtigkeiten trennen
+    features, importances = zip(*feature_importance)
+    
+    # Balkendiagramm erstellen
+    colors = ['red' if imp < 0 else 'green' for imp in importances]
+    y_pos = np.arange(len(features))
+    
+    plt.barh(y_pos, importances, color=colors)
+    plt.yticks(y_pos, features)
+    
+    plt.xlabel('Feature-Einfluss')
+    plt.title('LIME Feature-Wichtigkeiten')
+    
+    # Vertikale Linie bei 0 für bessere Lesbarkeit
+    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
+    
+    plt.grid(alpha=0.3)
+    plt.tight_layout()
+    plt.savefig('output/lime_feature_importance.png', dpi=300)
+    plt.show()
+    
+    # Ausgabe der Ergebnisse
+    print("\nLIME Feature-Wichtigkeiten Analyse:")
+    print("-" * 50)
+    print(f"Random Forest Vorhersage für diese Instanz: {rf_prediction:.4f}")
+    print("\nFeature-Einflüsse:")
+    for feature, importance in feature_importance:
+        print(f"{feature}: {importance:+.4f}")
+    
+    return {
+        'rf_prediction': rf_prediction,
+        'feature_importance': dict(feature_importance)
+    }
+
+# Analyse how well LIME explains the RF prediction (tuned model, up to 20 features)
+importance_analysis = analyze_lime_feature_importance(
+    instance=instance,
+    rf_model=best_rf_model,
+    lime_explainer=lime_explainer,
+    num_features=20
+)
--- a/extracted_cells/cell18.py
+++ b/extracted_cells/cell18.py
@ -0,0 +1,49 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Choose two features to vary:
+less_important_feature_education = "education.num"
+less_important_feature_fnlwgt = "fnlwgt" 
+
+# Set the range over which each of the two features is varied (50 evenly spaced values around the original)
+education_range = np.linspace(instance_df[less_important_feature_education].values[0] - 10, instance_df[less_important_feature_education].values[0] + 10, 50)
+fnlwgt_range = np.linspace(instance_df[less_important_feature_fnlwgt].values[0] - 100000, instance_df[less_important_feature_fnlwgt].values[0] + 100000, 50)
+
+# Build one copy of the instance per grid value
+instances_education = pd.DataFrame([instance] * len(education_range), columns=X_train.columns)
+instances_fnlwgt = pd.DataFrame([instance] * len(fnlwgt_range), columns=X_train.columns)
+
+# Overwrite the varied feature in the copies
+instances_education[less_important_feature_education] = education_range
+instances_fnlwgt[less_important_feature_fnlwgt] = fnlwgt_range
+
+# Model predictions (probabilities for class index 1)
+instances_education["prediction"] = best_rf_model.predict_proba(instances_education)[:, 1]
+instances_fnlwgt["prediction"] = best_rf_model.predict_proba(instances_fnlwgt)[:, 1]
+
+# Determine the y-axis limits (min/max over all predictions)
+y_min = min(instances_education["prediction"].min(), instances_fnlwgt["prediction"].min())
+y_max = max(instances_education["prediction"].max(), instances_fnlwgt["prediction"].max())
+
+# Plot the variation of 'education.num' (moderate influence)
+plt.figure(figsize=(8,5))
+plt.plot(education_range, instances_education["prediction"], label="Moderater Einfluss auf die Vorhersage", color='green')
+plt.axvline(instance_df[less_important_feature_education].values[0], color="red", linestyle="dashed", label="Originalwert")
+plt.xlabel("Bildungsniveau (education-num)")
+plt.ylabel("Vorhersage (0 = <=50K, 1 = >50K)")
+plt.title(f"Einfluss von {less_important_feature_education} auf die Vorhersage")
+plt.ylim([y_min, y_max])  # Same y-axis in both plots
+plt.legend()
+plt.show()
+
+# Plot the variation of 'fnlwgt' (little influence)
+plt.figure(figsize=(8,5))
+plt.plot(fnlwgt_range, instances_fnlwgt["prediction"], label="Wenig Einfluss auf die Vorhersage", color='orange')
+plt.axvline(instance_df[less_important_feature_fnlwgt].values[0], color="red", linestyle="dashed", label="Originalwert")
+plt.xlabel("Finales Gewicht (fnlwgt)")
+plt.ylabel("Vorhersage (0 = <=50K, 1 = >50K)")
+plt.title(f"Einfluss von {less_important_feature_fnlwgt} auf die Vorhersage")
+plt.ylim([y_min, y_max])  # Same y-axis in both plots
+plt.legend()
+plt.show()
--- a/extracted_cells/cell19.py
+++ b/extracted_cells/cell19.py
@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics.pairwise import euclidean_distances
+
+# Number of perturbations to generate
+num_samples = 500  
+
+# The original instance we want to explain
+original_instance = instance_df.iloc[0].copy()
+
+# Create perturbed instances: Gaussian noise with scale 1.0 around the instance, on every feature (unseeded)
+perturbed_instances = pd.DataFrame(
+    np.random.normal(loc=original_instance, scale=1.0, size=(num_samples, len(original_instance))),
+    columns=original_instance.index
+)
+
+# Predictions for the perturbed instances with the random forest model
+perturbed_instances["prediction"] = best_rf_model.predict_proba(perturbed_instances)[:, 1]
+
+# Compute sample weights from the distance to the original instance (Gaussian kernel)
+distances = euclidean_distances(perturbed_instances.drop(columns=["prediction"]), [original_instance])
+kernel_width = np.sqrt(len(original_instance))  # Kernel bandwidth
+weights = np.exp(- (distances ** 2) / (2 * (kernel_width ** 2)))
+
+# Weighted local linear regression to explain the prediction
+lin_reg = LinearRegression()
+lin_reg.fit(perturbed_instances.drop(columns=["prediction"]), perturbed_instances["prediction"], sample_weight=weights.flatten())
+
+# Feature weights (regression coefficients), largest magnitude first
+feature_importances = pd.Series(lin_reg.coef_, index=original_instance.index).sort_values(key=abs, ascending=False)
+
+# Visualise the ten most important features
+plt.figure(figsize=(8, 5))
+feature_importances[:10].plot(kind="barh", color="skyblue")
+plt.xlabel("Einfluss auf die Vorhersage")
+plt.title("Erklärungsmodell (Nachbildung von LIME)")
+plt.gca().invert_yaxis()
+plt.show()
--- a/extracted_cells/cell2.py
+++ b/extracted_cells/cell2.py
@ -0,0 +1,2 @@
+# Information about the dataset: (rows, columns)
+print("Datensatzgröße:", df.shape)
--- a/extracted_cells/cell20.py
+++ b/extracted_cells/cell20.py
@ -0,0 +1,25 @@
+def predict_fn(x):
+    # Konvertiere das NumPy-Array zurück in ein DataFrame mit den richtigen Feature-Namen
+    df = pd.DataFrame(x, columns=X_train.columns)
+    return best_rf_model.predict_proba(df)
+
+# Speichere mehrere Erklärungen
+explanations = []
+for i in range(100):
+    exp = lime_explainer.explain_instance(
+        instance, 
+        predict_fn,
+        num_features=20
+    )
+    explanations.append(exp.as_list())
+
+# Berechne die Varianz der Feature-Wichtigkeiten
+feature_variances = {}
+for i in range(len(explanations[0])):
+    feature = explanations[0][i][0]
+    values = [exp[i][1] for exp in explanations if len(exp) > i and exp[i][0] == feature]
+    feature_variances[feature] = np.var(values)
+
+print("Varianz der Feature-Wichtigkeiten:")
+for feature, variance in feature_variances.items():
+    print(f"{feature}: {variance:.6f}")
--- a/extracted_cells/cell21.py
+++ b/extracted_cells/cell21.py
@ -0,0 +1,53 @@
+# Erstelle nur das farbcodierte Stabilitätsdiagramm für die Top-5 Features
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.patches import Patch
+
+# Anzahl der anzuzeigenden Features (Top-N mit höchster Varianz)
+num_features_to_show = 5  # Top-5 Features
+
+# Sortiere die Features nach Varianz (absteigend)
+sorted_features = sorted(feature_variances.items(), key=lambda x: x[1], reverse=True)
+# Beschränke auf die Top-N Features
+sorted_features = sorted_features[:num_features_to_show]
+features = [item[0] for item in sorted_features]
+variances = [item[1] for item in sorted_features]
+
+# Definiere Schwellenwerte für die Farbkodierung
+low_threshold = 0.0001
+medium_threshold = 0.001
+
+# Farbkodierung basierend auf Varianzwerten
+colors = []
+for v in variances:
+    if v < low_threshold:
+        colors.append('green')  # Niedrige Varianz - sehr stabil
+    elif v < medium_threshold:
+        colors.append('orange')  # Mittlere Varianz - mäßig stabil
+    else:
+        colors.append('red')    # Hohe Varianz - instabil
+
+# Erstelle das Balkendiagramm mit Farbkodierung
+plt.figure(figsize=(10, 6))
+bars = plt.barh(features, variances, color=colors)
+
+# Füge Werte am Ende der Balken hinzu
+for i, v in enumerate(variances):
+    plt.text(v + 0.00001, i, f"{v:.6f}", va='center')
+
+# Füge eine Legende hinzu
+legend_elements = [
+    Patch(facecolor='green', label='Sehr stabil (< 0.0001)'),
+    Patch(facecolor='orange', label='Mäßig stabil (< 0.001)'),
+    Patch(facecolor='red', label='Instabil (≥ 0.001)')
+]
+plt.legend(handles=legend_elements, loc='lower right')
+
+plt.xlabel('Varianz')
+plt.title('Stabilität der Top-5 LIME-Features')
+plt.grid(axis='x', linestyle='--', alpha=0.7)
+plt.tight_layout()
+
+# Speichere die Abbildung
+plt.savefig('output/Lime_Varianz_Features.png', dpi=300)
+plt.show()
--- a/extracted_cells/cell22.py
+++ b/extracted_cells/cell22.py
@ -0,0 +1,14 @@
+from sklearn.tree import DecisionTreeClassifier, export_text
+
+# Random Forest-Vorhersagen verwenden als Ziel
+rf_predictions = rf_model.predict(X_train)
+
+# Einfachen Entscheidungsbaum auf die Vorhersagen des Random Forests trainieren
+surrogate_tree = DecisionTreeClassifier(max_depth=5)
+surrogate_tree.fit(X_train, rf_predictions)
+
+# Evaluieren, wie gut der Baum den Random Forest approximiert
+surrogate_predictions = surrogate_tree.predict(X_test)
+rf_test_predictions = rf_model.predict(X_test)
+surrogate_accuracy = np.mean(surrogate_predictions == rf_test_predictions)
+print(f"Genauigkeit des Surrogate-Modells: {surrogate_accuracy:.4f}")
--- a/extracted_cells/cell23.py
+++ b/extracted_cells/cell23.py
@ -0,0 +1,13 @@
+# Formatierte Regeln anzeigen
+def format_rules(tree_rules):
+    """Formatiert die Baumregeln mit menschenlesbaren Klassennamen"""
+    # Ersetze 'class: 0' durch 'Einkommen ≤ 50K'
+    formatted_rules = tree_rules.replace('class: 0', 'Einkommen ≤ 50K')
+    # Ersetze 'class: 1' durch 'Einkommen > 50K'
+    formatted_rules = formatted_rules.replace('class: 1', 'Einkommen > 50K')
+    return formatted_rules
+
+# Extract the rules from the surrogate tree
+tree_rules = export_text(surrogate_tree, feature_names=X_train.columns.tolist())
+#print(tree_rules)
+print(format_rules(tree_rules))
--- a/extracted_cells/cell24.py
+++ b/extracted_cells/cell24.py
@ -0,0 +1,69 @@
+from sklearn.tree import _tree
+
+def extract_single_rule(tree, feature_names, class_to_extract=1):
+    """
+    Extrahiert eine einzelne Regel aus einem Decision Tree für eine bestimmte Klasse.
+    
+    Parameters:
+    -----------
+    tree : DecisionTreeClassifier
+        Der trainierte Entscheidungsbaum
+    feature_names : list
+        Liste der Feature-Namen
+    class_to_extract : int, default=1
+        Die Klasse, für die eine Regel extrahiert werden soll (0=≤50K, 1=>50K)
+    
+    Returns:
+    --------
+    rule : str
+        Eine lesbare Regel als String
+    """
+    tree_ = tree.tree_
+    
+    # Funktion zum rekursiven Extrahieren einer Regel
+    def tree_to_rule(node, depth, conditions):
+        # Wenn wir einen Blattknoten erreicht haben
+        if tree_.children_left[node] == _tree.TREE_LEAF:
+            # Prüfe, ob dieser Blattknoten die gewünschte Klasse vorhersagt
+            if np.argmax(tree_.value[node][0]) == class_to_extract:
+                # Formatiere die Bedingungen als Regel
+                if conditions:
+                    rule = " UND ".join(conditions)
+                    return rule
+                else:
+                    return "Keine Bedingungen (Wurzelklasse)"
+            return None
+            
+        # Feature und Schwellenwert am aktuellen Knoten
+        feature = feature_names[tree_.feature[node]]
+        threshold = tree_.threshold[node]
+        
+        # Linkspfad (≤)
+        left_conditions = conditions + [f"{feature} ≤ {threshold:.2f}"]
+        left_rule = tree_to_rule(tree_.children_left[node], depth + 1, left_conditions)
+        if left_rule is not None:
+            return left_rule
+            
+        # Rechtspfad (>)
+        right_conditions = conditions + [f"{feature} > {threshold:.2f}"]
+        right_rule = tree_to_rule(tree_.children_right[node], depth + 1, right_conditions)
+        if right_rule is not None:
+            return right_rule
+            
+        # Keine passende Regel gefunden
+        return None
+    
+    # Starte die Suche vom Wurzelknoten
+    rule = tree_to_rule(0, 1, [])
+    
+    # Formatiere die Ausgabe
+    class_name = "Einkommen > 50K" if class_to_extract == 1 else "Einkommen ≤ 50K"
+    if rule:
+        return f"WENN {rule} DANN {class_name}"
+    else:
+        return f"Keine Regel für {class_name} gefunden."
+
+# Extract one rule for high income (class 1) from the surrogate tree
+single_rule = extract_single_rule(surrogate_tree, X_train.columns.tolist(), class_to_extract=1)
+print("Einzelne Regel aus dem Surrogate-Modell:")
+print(single_rule)
--- a/extracted_cells/cell25.py
+++ b/extracted_cells/cell25.py
@ -0,0 +1,67 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+def plot_surrogate_accuracy_vs_depth_test_only(rf_model, X_train, X_test, max_depths=range(1, 16)):
+    """
+    Visualisiert die Genauigkeit des Surrogate-Modells für verschiedene Baumtiefen,
+    fokussiert nur auf die Testdaten.
+    """
+    # Random Forest-Vorhersagen (nur einmal berechnen)
+    rf_train_predictions = rf_model.predict(X_train)
+    rf_test_predictions = rf_model.predict(X_test)
+    
+    # Ergebnisse für verschiedene Baumtiefen
+    test_accuracies = []
+    
+    for depth in max_depths:
+        # Surrogate-Baum mit aktueller Tiefe trainieren
+        surrogate_tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
+        surrogate_tree.fit(X_train, rf_train_predictions)
+        
+        # Vorhersagen
+        surrogate_test_pred = surrogate_tree.predict(X_test)
+        
+        # Genauigkeit berechnen
+        test_acc = np.mean(surrogate_test_pred == rf_test_predictions)
+        test_accuracies.append(test_acc)
+    
+    # Visualisierung
+    plt.figure(figsize=(10, 6))
+    plt.plot(max_depths, test_accuracies, 'o-', color='#ED7D31', linewidth=2)
+    
+    # Finde die beste Tiefe
+    best_depth = max_depths[np.argmax(test_accuracies)]
+    best_acc = max(test_accuracies)
+    
+    # Markiere den besten Punkt
+    plt.scatter([best_depth], [best_acc], s=100, c='red', zorder=5)
+    plt.annotate(f'Optimale Tiefe: {best_depth}\nGenauigkeit: {best_acc:.4f}',
+                xy=(best_depth, best_acc), xytext=(best_depth+1, best_acc-0.05),
+                arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
+    
+    # Beschriftungen und Layout
+    plt.grid(alpha=0.3)
+    plt.title('Surrogate-Modell-Genauigkeit bei verschiedenen Baumtiefen', fontsize=14)
+    plt.xlabel('Maximale Baumtiefe', fontsize=12)
+    plt.ylabel('Genauigkeit auf Testdaten', fontsize=12)
+    
+    # Füge Werte über den Punkten hinzu
+    for i, acc in enumerate(test_accuracies):
+        plt.text(max_depths[i], acc + 0.01, f'{acc:.3f}', ha='center')
+    
+    # Y-Achse anpassen (je nach Daten)
+    y_min = max(0, min(test_accuracies) - 0.05)
+    plt.ylim(y_min, 1.05)
+    
+    # Verbesserte visuelle Elemente
+    plt.fill_between(max_depths, test_accuracies, y_min, alpha=0.1, color='#ED7D31')
+    
+    plt.tight_layout()
+    plt.savefig('output/surrogate_accuracy.png', dpi=300)
+    plt.show()
+    
+    return best_depth, best_acc
+
+# Call the function
+best_depth, best_accuracy = plot_surrogate_accuracy_vs_depth_test_only(rf_model, X_train, X_test)
+print(f"Optimale Baumtiefe: {best_depth} mit einer Genauigkeit von {best_accuracy:.4f}")
--- a/extracted_cells/cell26.py
+++ b/extracted_cells/cell26.py
@ -0,0 +1,3 @@
+# For Quarto presentations
+import extract_cells
+extract_cells.extractToFiles("Explainable_AI_Adult_Census_Income.ipynb")
--- a/extracted_cells/cell3.py
+++ b/extracted_cells/cell3.py
@ -0,0 +1,3 @@
+print("\nDatentypen:")
+#print(df.dtypes)
+df.info()  # column dtypes and non-null counts
--- a/extracted_cells/cell4.py
+++ b/extracted_cells/cell4.py
@ -0,0 +1,2 @@
+print("\nFehlende Werte:")
+df.isnull().sum()  # number of null entries per column
--- a/extracted_cells/cell5.py
+++ b/extracted_cells/cell5.py
@ -0,0 +1,2 @@
+print("\nStatistische Zusammenfassung:")
+df.describe()  # summary statistics table
--- a/extracted_cells/cell6.py
+++ b/extracted_cells/cell6.py
@ -0,0 +1,13 @@
+# Check the distribution of the target class
+plt.figure(figsize=(8, 6))
+sns.countplot(x='income', data=df)
+plt.title('Verteilung der Einkommensklassen')
+plt.xlabel('Einkommen')
+plt.ylabel('Anzahl')
+plt.savefig('output/Verteilung_Einkommensklassen.png', dpi=300)
+plt.show()
+
+# Compute the percentage distribution
+income_counts = df['income'].value_counts(normalize=True) * 100
+print("Prozentuale Verteilung der Einkommensklassen:")
+print(income_counts)
--- a/extracted_cells/cell7.py
+++ b/extracted_cells/cell7.py
@ -0,0 +1,5 @@
+# Überprüfen auf fehlende Werte oder '?'
+for col in df.columns:
+    missing_count = df[df[col] == '?'].shape[0]
+    if missing_count > 0:
+        print(f"Spalte '{col}' hat {missing_count} Einträge mit '?'")
--- a/extracted_cells/cell8.py
+++ b/extracted_cells/cell8.py
@ -0,0 +1,13 @@
+# Replace '?' with NaN and then with the most frequent value
+df_clean = df.copy()
+
+for col in df_clean.columns:
+    if df_clean[col].dtype == 'object':
+        # Replace '?' with NaN
+        df_clean[col] = df_clean[col].replace('?', np.nan)
+        
+        # Replace NaN with the most frequent value
+        most_frequent = df_clean[col].mode()[0]
+        df_clean[col] = df_clean[col].fillna(most_frequent)
+
+df_clean.head(10)
--- a/extracted_cells/cell9.py
+++ b/extracted_cells/cell9.py
@ -0,0 +1,20 @@
+# Convert categorical variables to numeric ones
+categorical_cols = df_clean.select_dtypes(include=['object']).columns
+print("Kategorische Spalten:", categorical_cols.tolist())
+
+# Label encoding for the target variable
+label_encoder = LabelEncoder()
+df_clean['income_encoded'] = label_encoder.fit_transform(df_clean['income'])
+print("\nLabel Encoding für 'income':")
+for i, label in enumerate(label_encoder.classes_):
+    print(f"{label} -> {i}")
+
+# One-hot encoding for the categorical variables (except the target variable)
+categorical_cols = categorical_cols.drop('income')
+df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=False)
+
+
+print("\nNeue Spalten durch One-Hot Encoding:")
+print(df_encoded.columns[:10].tolist())
+
+print("\nDatensatz nach Vorverarbeitung:", df_encoded.shape)
				`@ -0,0 +1 @@`
				`%pip install scikit-learn matplotlib seaborn pandas numpy lime`