Initial commit (forked from private repo)
This commit is contained in:
1
extracted_cells/cell0.py
Normal file
1
extracted_cells/cell0.py
Normal file
@ -0,0 +1 @@
|
||||
# Install the third-party packages this notebook needs (Jupyter/IPython cell magic).
%pip install scikit-learn matplotlib seaborn pandas numpy lime
|
||||
15
extracted_cells/cell1.py
Normal file
15
extracted_cells/cell1.py
Normal file
@ -0,0 +1,15 @@
|
||||
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import plot_tree

# Load the Adult Census Income dataset
df = pd.read_csv("./sample_data/adult_census_income/adult.csv")

# Show the first rows of the dataset
df.head(10)
|
||||
11
extracted_cells/cell10.py
Normal file
11
extracted_cells/cell10.py
Normal file
@ -0,0 +1,11 @@
|
||||
# Define features and target values
X = df_encoded.drop(['income', 'income_encoded'], axis=1)  # drop both the raw and the encoded target column
y = df_encoded['income_encoded']


# Split the data into training and test sets (stratified to preserve the class balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Trainingsdaten: {X_train.shape[0]} Beispiele")
print(f"Testdaten: {X_test.shape[0]} Beispiele")
print(f"Features: {X_train.shape[1]} Merkmale")
|
||||
17
extracted_cells/cell11.py
Normal file
17
extracted_cells/cell11.py
Normal file
@ -0,0 +1,17 @@
|
||||
# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions for the test data
y_pred = rf_model.predict(X_test)

# Evaluate model performance
# NOTE(review): precision/recall/f1 use sklearn's default positive label 1 —
# presumably '>50K' from the label encoding; confirm against the encoding cell.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
|
||||
16
extracted_cells/cell12.py
Normal file
16
extracted_cells/cell12.py
Normal file
@ -0,0 +1,16 @@
|
||||
|
||||
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['<=50K', '>50K'],
            yticklabels=['<=50K', '>50K'])
plt.xlabel('Vorhergesagt')
plt.ylabel('Tatsächlich')
plt.title('Confusion Matrix')
# NOTE(review): assumes the 'output/' directory already exists — savefig does not create it
plt.savefig('output/Confusions_Matrix.png', dpi=300)
plt.show()

# Classification report (per-class precision/recall/f1)
print("\nKlassifikationsbericht:")
print(classification_report(y_test, y_pred, target_names=['<=50K', '>50K']))
|
||||
12
extracted_cells/cell13.py
Normal file
12
extracted_cells/cell13.py
Normal file
@ -0,0 +1,12 @@
|
||||
# Visualize a single decision tree from the Random Forest
plt.figure(figsize=(25, 12))
tree_to_plot = rf_model.estimators_[0]  # pick the first tree of the forest
plot_tree(tree_to_plot,
          feature_names=X_train.columns,
          class_names=['<=50K', '>50K'],
          filled=True,
          rounded=True,
          fontsize=10,
          max_depth=3)  # draw only the top levels for readability
plt.savefig('output/Random_Forest_Tree_Example.png', dpi=300)
plt.show()
|
||||
12
extracted_cells/cell14.py
Normal file
12
extracted_cells/cell14.py
Normal file
@ -0,0 +1,12 @@
|
||||
# Global feature importance of the Random Forest
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot the 15 most important features
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('output/Feature_Importance.png', dpi=300)
plt.show()
|
||||
33
extracted_cells/cell15.py
Normal file
33
extracted_cells/cell15.py
Normal file
@ -0,0 +1,33 @@
|
||||
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Exhaustive grid search with 3-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters found by the search
print("Beste Parameter:")
print(grid_search.best_params_)

# Best model from the search
best_rf_model = grid_search.best_estimator_

# Predictions with the best model
y_pred_best = best_rf_model.predict(X_test)

# Evaluate the performance of the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)

print(f"\nBestes Modell:")
print(f"Accuracy: {accuracy_best:.4f}")
print(f"Precision: {precision_best:.4f}")
print(f"Recall: {recall_best:.4f}")
print(f"F1 Score: {f1_best:.4f}")
|
||||
17
extracted_cells/cell16.py
Normal file
17
extracted_cells/cell16.py
Normal file
@ -0,0 +1,17 @@
|
||||
# LIME for explainability
from lime import lime_tabular
import random

# Create a LIME explainer over the training data
lime_explainer = lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=X_train.columns,
    class_names=['<=50K', '>50K'],
    mode='classification',
    random_state=42
)

# Pick a random example from the test data
random_idx = random.randint(0, len(X_test) - 1)
instance_df = X_test.iloc[random_idx:random_idx+1]  # keep a 1-row DataFrame for model calls
instance = instance_df.values[0]  # LIME needs the raw feature array
|
||||
65
extracted_cells/cell17.py
Normal file
65
extracted_cells/cell17.py
Normal file
@ -0,0 +1,65 @@
|
||||
# Explain the prediction with LIME

def analyze_lime_feature_importance(instance, rf_model, lime_explainer, num_features=5):
    """
    Analyze and visualize the feature weights of a LIME explanation.

    Parameters: instance is the raw feature array to explain; rf_model is a
    fitted classifier exposing predict_proba and feature_names_in_;
    lime_explainer is a LimeTabularExplainer; num_features is the number of
    features the explanation should contain.

    Returns a dict with 'rf_prediction' (model probability for class 1) and
    'feature_importance' (LIME weight per feature).
    """
    # Generate the LIME explanation
    exp = lime_explainer.explain_instance(
        instance,
        rf_model.predict_proba,
        num_features=num_features
    )

    # Random Forest prediction (for information only); rebuild a DataFrame
    # with proper column names so predict_proba sees named features
    feature_names = rf_model.feature_names_in_
    instance_df = pd.DataFrame([instance], columns=feature_names)
    rf_prediction = rf_model.predict_proba(instance_df)[0, 1]

    # Extract the per-feature weights from the LIME explanation
    feature_importance = exp.as_list()

    # Visualize the feature weights
    plt.figure(figsize=(10, 6))

    # Separate feature labels and their weights
    features, importances = zip(*feature_importance)

    # Bar chart, colored by the sign of the weight
    colors = ['red' if imp < 0 else 'green' for imp in importances]
    y_pos = np.arange(len(features))

    plt.barh(y_pos, importances, color=colors)
    plt.yticks(y_pos, features)

    plt.xlabel('Feature-Einfluss')
    plt.title('LIME Feature-Wichtigkeiten')

    # Vertical line at 0 for readability
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)

    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig('output/lime_feature_importance.png', dpi=300)
    plt.show()

    # Print the results
    print("\nLIME Feature-Wichtigkeiten Analyse:")
    print("-" * 50)
    print(f"Random Forest Vorhersage für diese Instanz: {rf_prediction:.4f}")
    print("\nFeature-Einflüsse:")
    for feature, importance in feature_importance:
        print(f"{feature}: {importance:+.4f}")

    return {
        'rf_prediction': rf_prediction,
        'feature_importance': dict(feature_importance)
    }

# Analyze how well LIME explains the RF prediction
importance_analysis = analyze_lime_feature_importance(
    instance=instance,
    rf_model=best_rf_model,
    lime_explainer=lime_explainer,
    num_features=20
)
|
||||
49
extracted_cells/cell18.py
Normal file
49
extracted_cells/cell18.py
Normal file
@ -0,0 +1,49 @@
|
||||
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pick two features to vary:
less_important_feature_education = "education.num"
less_important_feature_fnlwgt = "fnlwgt"

# Define the variation range for the two features around their original values
education_range = np.linspace(instance_df[less_important_feature_education].values[0] - 10, instance_df[less_important_feature_education].values[0] + 10, 50)
fnlwgt_range = np.linspace(instance_df[less_important_feature_fnlwgt].values[0] - 100000, instance_df[less_important_feature_fnlwgt].values[0] + 100000, 50)

# Create copies of the instance, one per variation step
instances_education = pd.DataFrame([instance] * len(education_range), columns=X_train.columns)
instances_fnlwgt = pd.DataFrame([instance] * len(fnlwgt_range), columns=X_train.columns)

# Overwrite the varied feature in each copy
instances_education[less_important_feature_education] = education_range
instances_fnlwgt[less_important_feature_fnlwgt] = fnlwgt_range

# Model predictions (probability of class 1)
instances_education["prediction"] = best_rf_model.predict_proba(instances_education)[:, 1]
instances_fnlwgt["prediction"] = best_rf_model.predict_proba(instances_fnlwgt)[:, 1]

# Determine shared y-axis limits (min/max over all predictions)
y_min = min(instances_education["prediction"].min(), instances_fnlwgt["prediction"].min())
y_max = max(instances_education["prediction"].max(), instances_fnlwgt["prediction"].max())

# Visualize the variation of 'education-num' (moderate influence)
plt.figure(figsize=(8,5))
plt.plot(education_range, instances_education["prediction"], label="Moderater Einfluss auf die Vorhersage", color='green')
plt.axvline(instance_df[less_important_feature_education].values[0], color="red", linestyle="dashed", label="Originalwert")
plt.xlabel("Bildungsniveau (education-num)")
plt.ylabel("Vorhersage (0 = <=50K, 1 = >50K)")
plt.title(f"Einfluss von {less_important_feature_education} auf die Vorhersage")
plt.ylim([y_min, y_max])  # shared y-axis across both plots
plt.legend()
plt.show()

# Visualize the variation of 'fnlwgt' (little influence)
plt.figure(figsize=(8,5))
plt.plot(fnlwgt_range, instances_fnlwgt["prediction"], label="Wenig Einfluss auf die Vorhersage", color='orange')
plt.axvline(instance_df[less_important_feature_fnlwgt].values[0], color="red", linestyle="dashed", label="Originalwert")
plt.xlabel("Finales Gewicht (fnlwgt)")
plt.ylabel("Vorhersage (0 = <=50K, 1 = >50K)")
plt.title(f"Einfluss von {less_important_feature_fnlwgt} auf die Vorhersage")
plt.ylim([y_min, y_max])  # shared y-axis across both plots
plt.legend()
plt.show()
|
||||
40
extracted_cells/cell19.py
Normal file
40
extracted_cells/cell19.py
Normal file
@ -0,0 +1,40 @@
|
||||
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import euclidean_distances

# Number of perturbations to generate
num_samples = 500

# Fix the RNG so the perturbations — and therefore the explanation — are
# reproducible, consistent with random_state=42 used everywhere else in
# this notebook. (Previously this cell was unseeded and gave different
# importances on every run.)
np.random.seed(42)

# The original instance we want to explain
original_instance = instance_df.iloc[0].copy()

# Create perturbed instances by randomly varying all features around the original
perturbed_instances = pd.DataFrame(
    np.random.normal(loc=original_instance, scale=1.0, size=(num_samples, len(original_instance))),
    columns=original_instance.index
)

# Model predictions (probability of class 1) for the perturbed instances
perturbed_instances["prediction"] = best_rf_model.predict_proba(perturbed_instances)[:, 1]

# Weight each perturbation by its distance to the original instance (RBF kernel)
distances = euclidean_distances(perturbed_instances.drop(columns=["prediction"]), [original_instance])
kernel_width = np.sqrt(len(original_instance))  # kernel bandwidth heuristic
weights = np.exp(- (distances ** 2) / (2 * (kernel_width ** 2)))

# Weighted local linear regression to explain the prediction
lin_reg = LinearRegression()
lin_reg.fit(perturbed_instances.drop(columns=["prediction"]), perturbed_instances["prediction"], sample_weight=weights.flatten())

# Feature importances = coefficients of the local surrogate, sorted by magnitude
feature_importances = pd.Series(lin_reg.coef_, index=original_instance.index).sort_values(key=abs, ascending=False)

# Visualize the most important features
plt.figure(figsize=(8, 5))
feature_importances[:10].plot(kind="barh", color="skyblue")
plt.xlabel("Einfluss auf die Vorhersage")
plt.title("Erklärungsmodell (Nachbildung von LIME)")
plt.gca().invert_yaxis()
plt.show()
|
||||
2
extracted_cells/cell2.py
Normal file
2
extracted_cells/cell2.py
Normal file
@ -0,0 +1,2 @@
|
||||
# Basic information about the dataset: (rows, columns)
print(f"Datensatzgröße: {df.shape}")
|
||||
25
extracted_cells/cell20.py
Normal file
25
extracted_cells/cell20.py
Normal file
@ -0,0 +1,25 @@
|
||||
def predict_fn(x):
    """Prediction wrapper for LIME: rebuild a DataFrame with the correct
    feature names before calling the model."""
    df = pd.DataFrame(x, columns=X_train.columns)
    return best_rf_model.predict_proba(df)

# Generate the same explanation repeatedly to measure its stability
explanations = []
for i in range(100):
    exp = lime_explainer.explain_instance(
        instance,
        predict_fn,
        num_features=20
    )
    explanations.append(exp.as_list())

# Compute the variance of each feature's LIME weight across runs.
# exp.as_list() is ordered by weight magnitude, which can differ between
# runs — so aggregate by feature name rather than by list position.
# (Matching by position and name simultaneously silently dropped every
# run where the ordering differed, biasing the variance estimate.)
feature_values = {}
for explanation in explanations:
    for feature, value in explanation:
        feature_values.setdefault(feature, []).append(value)

feature_variances = {feature: np.var(values) for feature, values in feature_values.items()}

print("Varianz der Feature-Wichtigkeiten:")
for feature, variance in feature_variances.items():
    print(f"{feature}: {variance:.6f}")
|
||||
53
extracted_cells/cell21.py
Normal file
53
extracted_cells/cell21.py
Normal file
@ -0,0 +1,53 @@
|
||||
# Color-coded stability chart for the top-5 features only
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch

# Number of features to display (top-N with the highest variance)
num_features_to_show = 5  # top-5 features

# Sort the features by variance (descending)
sorted_features = sorted(feature_variances.items(), key=lambda x: x[1], reverse=True)
# Restrict to the top-N features
sorted_features = sorted_features[:num_features_to_show]
features = [item[0] for item in sorted_features]
variances = [item[1] for item in sorted_features]

# Thresholds for the color coding
low_threshold = 0.0001
medium_threshold = 0.001

# Color coding based on the variance values
colors = []
for v in variances:
    if v < low_threshold:
        colors.append('green')  # low variance - very stable
    elif v < medium_threshold:
        colors.append('orange')  # medium variance - moderately stable
    else:
        colors.append('red')  # high variance - unstable

# Create the bar chart with color coding
plt.figure(figsize=(10, 6))
bars = plt.barh(features, variances, color=colors)

# Annotate each bar with its value
for i, v in enumerate(variances):
    plt.text(v + 0.00001, i, f"{v:.6f}", va='center')

# Add a legend explaining the colors
legend_elements = [
    Patch(facecolor='green', label='Sehr stabil (< 0.0001)'),
    Patch(facecolor='orange', label='Mäßig stabil (< 0.001)'),
    Patch(facecolor='red', label='Instabil (≥ 0.001)')
]
plt.legend(handles=legend_elements, loc='lower right')

plt.xlabel('Varianz')
plt.title('Stabilität der Top-5 LIME-Features')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Save the figure
plt.savefig('output/Lime_Varianz_Features.png', dpi=300)
plt.show()
|
||||
14
extracted_cells/cell22.py
Normal file
14
extracted_cells/cell22.py
Normal file
@ -0,0 +1,14 @@
|
||||
from sklearn.tree import DecisionTreeClassifier, export_text

# Use the Random Forest's predictions (not the true labels) as the target
rf_predictions = rf_model.predict(X_train)

# Train a simple decision tree on the forest's predictions (global surrogate)
surrogate_tree = DecisionTreeClassifier(max_depth=5)
surrogate_tree.fit(X_train, rf_predictions)

# Evaluate how well the tree approximates the Random Forest
# (agreement with the forest's predictions on the test data, i.e. fidelity)
surrogate_predictions = surrogate_tree.predict(X_test)
rf_test_predictions = rf_model.predict(X_test)
surrogate_accuracy = np.mean(surrogate_predictions == rf_test_predictions)
print(f"Genauigkeit des Surrogate-Modells: {surrogate_accuracy:.4f}")
|
||||
13
extracted_cells/cell23.py
Normal file
13
extracted_cells/cell23.py
Normal file
@ -0,0 +1,13 @@
|
||||
# Display nicely formatted rules
def format_rules(tree_rules):
    """Format the exported tree rules with human-readable class names."""
    replacements = {
        'class: 0': 'Einkommen ≤ 50K',  # low-income leaf label
        'class: 1': 'Einkommen > 50K',  # high-income leaf label
    }
    formatted = tree_rules
    for raw, readable in replacements.items():
        formatted = formatted.replace(raw, readable)
    return formatted
|
||||
|
||||
# Extract the rules from the surrogate tree as plain text
tree_rules = export_text(surrogate_tree, feature_names=X_train.columns.tolist())
#print(tree_rules)
print(format_rules(tree_rules))
|
||||
69
extracted_cells/cell24.py
Normal file
69
extracted_cells/cell24.py
Normal file
@ -0,0 +1,69 @@
|
||||
from sklearn.tree import _tree

def extract_single_rule(tree, feature_names, class_to_extract=1):
    """
    Extract a single rule from a decision tree for a given class.

    Parameters:
    -----------
    tree : DecisionTreeClassifier
        The trained decision tree
    feature_names : list
        List of feature names
    class_to_extract : int, default=1
        The class to extract a rule for (0 = ≤50K, 1 = >50K)

    Returns:
    --------
    rule : str
        A human-readable rule (the leftmost matching root-to-leaf path,
        conditions joined with 'UND')
    """
    tree_ = tree.tree_

    # Recursively search for the first leaf that predicts the target class
    def tree_to_rule(node, depth, conditions):
        # Leaf node reached?
        if tree_.children_left[node] == _tree.TREE_LEAF:
            # Check whether this leaf predicts the desired class
            if np.argmax(tree_.value[node][0]) == class_to_extract:
                # Format the collected conditions as a rule
                if conditions:
                    rule = " UND ".join(conditions)
                    return rule
                else:
                    return "Keine Bedingungen (Wurzelklasse)"
            return None

        # Feature and threshold at the current split node
        feature = feature_names[tree_.feature[node]]
        threshold = tree_.threshold[node]

        # Left path (≤) — explored first, so the leftmost match wins
        left_conditions = conditions + [f"{feature} ≤ {threshold:.2f}"]
        left_rule = tree_to_rule(tree_.children_left[node], depth + 1, left_conditions)
        if left_rule is not None:
            return left_rule

        # Right path (>)
        right_conditions = conditions + [f"{feature} > {threshold:.2f}"]
        right_rule = tree_to_rule(tree_.children_right[node], depth + 1, right_conditions)
        if right_rule is not None:
            return right_rule

        # No matching rule found in this subtree
        return None

    # Start the search from the root node
    rule = tree_to_rule(0, 1, [])

    # Format the output
    class_name = "Einkommen > 50K" if class_to_extract == 1 else "Einkommen ≤ 50K"
    if rule:
        return f"WENN {rule} DANN {class_name}"
    else:
        return f"Keine Regel für {class_name} gefunden."

# Extract one rule for high income (class 1)
single_rule = extract_single_rule(surrogate_tree, X_train.columns.tolist(), class_to_extract=1)
print("Einzelne Regel aus dem Surrogate-Modell:")
print(single_rule)
|
||||
67
extracted_cells/cell25.py
Normal file
67
extracted_cells/cell25.py
Normal file
@ -0,0 +1,67 @@
|
||||
import matplotlib.pyplot as plt
import numpy as np

def plot_surrogate_accuracy_vs_depth_test_only(rf_model, X_train, X_test, max_depths=range(1, 16)):
    """
    Visualize the surrogate model's fidelity for different tree depths,
    focusing on the test data only.

    Fidelity here means agreement with the Random Forest's predictions,
    not with the true labels. Returns (best_depth, best_accuracy).
    """
    # Random Forest predictions (computed only once)
    rf_train_predictions = rf_model.predict(X_train)
    rf_test_predictions = rf_model.predict(X_test)

    # Results for the different tree depths
    test_accuracies = []

    for depth in max_depths:
        # Train a surrogate tree with the current depth
        surrogate_tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
        surrogate_tree.fit(X_train, rf_train_predictions)

        # Predictions
        surrogate_test_pred = surrogate_tree.predict(X_test)

        # Compute agreement with the forest
        test_acc = np.mean(surrogate_test_pred == rf_test_predictions)
        test_accuracies.append(test_acc)

    # Visualization
    plt.figure(figsize=(10, 6))
    plt.plot(max_depths, test_accuracies, 'o-', color='#ED7D31', linewidth=2)

    # Find the best depth
    best_depth = max_depths[np.argmax(test_accuracies)]
    best_acc = max(test_accuracies)

    # Mark the best point
    plt.scatter([best_depth], [best_acc], s=100, c='red', zorder=5)
    plt.annotate(f'Optimale Tiefe: {best_depth}\nGenauigkeit: {best_acc:.4f}',
                 xy=(best_depth, best_acc), xytext=(best_depth+1, best_acc-0.05),
                 arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))

    # Labels and layout
    plt.grid(alpha=0.3)
    plt.title('Surrogate-Modell-Genauigkeit bei verschiedenen Baumtiefen', fontsize=14)
    plt.xlabel('Maximale Baumtiefe', fontsize=12)
    plt.ylabel('Genauigkeit auf Testdaten', fontsize=12)

    # Add the value above each point
    for i, acc in enumerate(test_accuracies):
        plt.text(max_depths[i], acc + 0.01, f'{acc:.3f}', ha='center')

    # Adapt the y-axis to the data
    y_min = max(0, min(test_accuracies) - 0.05)
    plt.ylim(y_min, 1.05)

    # Shaded area under the curve for visual emphasis
    plt.fill_between(max_depths, test_accuracies, y_min, alpha=0.1, color='#ED7D31')

    plt.tight_layout()
    plt.savefig('output/surrogate_accuracy.png', dpi=300)
    plt.show()

    return best_depth, best_acc

# Run the analysis
best_depth, best_accuracy = plot_surrogate_accuracy_vs_depth_test_only(rf_model, X_train, X_test)
print(f"Optimale Baumtiefe: {best_depth} mit einer Genauigkeit von {best_accuracy:.4f}")
|
||||
3
extracted_cells/cell26.py
Normal file
3
extracted_cells/cell26.py
Normal file
@ -0,0 +1,3 @@
|
||||
# For Quarto presentations: split this notebook's cells into separate .py files
import extract_cells
extract_cells.extractToFiles("Explainable_AI_Adult_Census_Income.ipynb")
|
||||
3
extracted_cells/cell3.py
Normal file
3
extracted_cells/cell3.py
Normal file
@ -0,0 +1,3 @@
|
||||
# Column data types plus non-null counts and memory usage
print("\nDatentypen:")
#print(df.dtypes)
df.info()
|
||||
2
extracted_cells/cell4.py
Normal file
2
extracted_cells/cell4.py
Normal file
@ -0,0 +1,2 @@
|
||||
# Count NaN values per column (the dataset's '?' placeholders are handled separately)
print("\nFehlende Werte:")
df.isnull().sum()
|
||||
2
extracted_cells/cell5.py
Normal file
2
extracted_cells/cell5.py
Normal file
@ -0,0 +1,2 @@
|
||||
# Summary statistics of the numeric columns
print("\nStatistische Zusammenfassung:")
df.describe()
|
||||
13
extracted_cells/cell6.py
Normal file
13
extracted_cells/cell6.py
Normal file
@ -0,0 +1,13 @@
|
||||
# Check the distribution of the target class
plt.figure(figsize=(8, 6))
sns.countplot(x='income', data=df)
plt.title('Verteilung der Einkommensklassen')
plt.xlabel('Einkommen')
plt.ylabel('Anzahl')
plt.savefig('output/Verteilung_Einkommensklassen.png', dpi=300)
plt.show()

# Compute the percentage distribution
income_counts = df['income'].value_counts(normalize=True) * 100
print("Prozentuale Verteilung der Einkommensklassen:")
print(income_counts)
|
||||
5
extracted_cells/cell7.py
Normal file
5
extracted_cells/cell7.py
Normal file
@ -0,0 +1,5 @@
|
||||
# Check for missing values encoded as '?'
for col in df.columns:
    # Count matches directly via the boolean mask instead of materializing
    # a filtered copy of the whole DataFrame just to read its shape
    missing_count = (df[col] == '?').sum()
    if missing_count > 0:
        print(f"Spalte '{col}' hat {missing_count} Einträge mit '?'")
|
||||
13
extracted_cells/cell8.py
Normal file
13
extracted_cells/cell8.py
Normal file
@ -0,0 +1,13 @@
|
||||
# Replace '?' with NaN and then with the most frequent value (mode imputation)
df_clean = df.copy()

for col in df_clean.columns:
    if df_clean[col].dtype == 'object':  # only categorical/string columns use '?'
        # Replace '?' with NaN
        df_clean[col] = df_clean[col].replace('?', np.nan)

        # Replace NaN with the column's most frequent value
        most_frequent = df_clean[col].mode()[0]
        df_clean[col] = df_clean[col].fillna(most_frequent)

df_clean.head(10)
|
||||
20
extracted_cells/cell9.py
Normal file
20
extracted_cells/cell9.py
Normal file
@ -0,0 +1,20 @@
|
||||
# Convert categorical variables to numeric ones
categorical_cols = df_clean.select_dtypes(include=['object']).columns
print("Kategorische Spalten:", categorical_cols.tolist())

# Label encoding for the target variable
label_encoder = LabelEncoder()
df_clean['income_encoded'] = label_encoder.fit_transform(df_clean['income'])
print("\nLabel Encoding für 'income':")
for i, label in enumerate(label_encoder.classes_):
    print(f"{label} -> {i}")

# One-hot encoding for the categorical variables (excluding the target)
categorical_cols = categorical_cols.drop('income')
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=False)


print("\nNeue Spalten durch One-Hot Encoding:")
print(df_encoded.columns[:10].tolist())

print("\nDatensatz nach Vorverarbeitung:", df_encoded.shape)
|
||||
Reference in New Issue
Block a user