import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Load data
# The file has no header row, so the column names are built here: ID and
# diagnosis first, then every base feature repeated once per statistic suffix
# (all '_mean' columns, then all '_se', then all '_worst').
base_features = ['radius', 'texture', 'perimeter', 'area',
                 'smoothness', 'compactness', 'concavity',
                 'concave_points', 'symmetry', 'fractal_dimension']
stat_suffixes = ['mean', 'se', 'worst']
column_names = ['ID', 'diagnosis']
for suffix in stat_suffixes:
    column_names.extend(f'{feat}_{suffix}' for feat in base_features)

data = pd.read_csv('wdbc.data', header=None, names=column_names)

# Features are every column except the identifier and the label.
X = data.drop(columns=['ID', 'diagnosis'])
# Encode the label numerically: 'B' -> 0, 'M' -> 1.
y = data['diagnosis'].map({'B': 0, 'M': 1})

# Side-by-side histograms of two features, with one semi-transparent
# histogram per diagnosis value so the class distributions can be compared.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, feature in zip(axes, ['radius_mean', 'texture_mean']):
    for label, group in data.groupby('diagnosis'):
        ax.hist(group[feature], alpha=0.5, label=label)
    ax.set_title(feature)
    ax.legend()
fig.tight_layout()
plt.show()

# Split data
# Half of the samples are held out for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Default (Gini) tree
# fit() returns the estimator itself, so construction and training are chained.
clf_gini = DecisionTreeClassifier(criterion='gini', random_state=42).fit(X_train, y_train)

# Evaluate on the held-out half and print the per-class report under a header line.
y_pred = clf_gini.predict(X_test)
print(
    'Classification Report (Gini):',
    classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']),
    sep='\n',
)

Classification Report (Gini):
              precision    recall  f1-score   support

      Benign       0.97      0.95      0.96       187
   Malignant       0.90      0.94      0.92        98

    accuracy                           0.94       285
   macro avg       0.93      0.94      0.94       285
weighted avg       0.94      0.94      0.94       285

# Draw the top levels (max_depth=2) of the fitted Gini tree, with nodes
# colour-filled by plot_tree.
plt.figure(figsize=(12, 8))
plot_tree(
    clf_gini,
    max_depth=2,
    # Pass a plain list rather than the pandas Index: some scikit-learn
    # releases validate feature_names strictly and reject an Index.
    feature_names=list(X.columns),
    class_names=['Benign', 'Malignant'],
    filled=True,
)
plt.show()

# Entropy criterion
# Same training data and seed as before; only the split criterion changes.
# fit() returns the estimator itself, so construction and training are chained.
clf_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)

# Evaluate on the held-out data and print the per-class report under a header line.
y_pred_e = clf_entropy.predict(X_test)
print(
    'Classification Report (Entropy):',
    classification_report(y_test, y_pred_e, target_names=['Benign', 'Malignant']),
    sep='\n',
)

Classification Report (Entropy):
              precision    recall  f1-score   support

      Benign       0.97      0.90      0.94       187
   Malignant       0.84      0.95      0.89        98

    accuracy                           0.92       285
   macro avg       0.90      0.93      0.91       285
weighted avg       0.93      0.92      0.92       285

# Draw the top levels (max_depth=2) of the fitted entropy tree, with nodes
# colour-filled by plot_tree.
plt.figure(figsize=(12, 8))
plot_tree(
    clf_entropy,
    max_depth=2,
    # Pass a plain list rather than the pandas Index: some scikit-learn
    # releases validate feature_names strictly and reject an Index.
    feature_names=list(X.columns),
    class_names=['Benign', 'Malignant'],
    filled=True,
)
plt.show()

# Sweep max_depth from 2 to 11 for the entropy tree and record the accuracy
# on the held-out test data at each depth.
depths = range(2, 12)
accs = []
for d in depths:
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=d, random_state=42)
    clf.fit(X_train, y_train)
    accs.append(accuracy_score(y_test, clf.predict(X_test)))

# Open a dedicated figure and show it explicitly so this chart is rendered on
# its own when the code runs as a script, instead of waiting for some later
# plt.show() call.
plt.figure()
plt.plot(depths, accs, marker='o')
plt.xlabel('max_depth')
plt.ylabel('Test Accuracy')
plt.title('Decision Tree: Depth vs. Accuracy')
plt.grid(True)
plt.show()

# Compare entropy trees at three depths on the held-out test data, collecting
# one row of metrics per depth.
depths = [3, 5, 7]
results = []
for d in depths:
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=d, random_state=42).fit(X_train, y_train)
    y_pred_d = clf.predict(X_test)
    # Build the row in a fixed key order; that order becomes the column order
    # of the DataFrame below.
    row = {'max_depth': d}
    for name, scorer in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ):
        row[name] = scorer(y_test, y_pred_d)
    results.append(row)

metrics_df = pd.DataFrame(results)
print(metrics_df)

   max_depth  accuracy  precision    recall        f1
0          3  0.940351   0.917526  0.908163  0.912821
1          5  0.926316   0.866667  0.928571  0.896552
2          7  0.919298   0.837838  0.948980  0.889952

# One separate figure per metric, each plotting the metric against max_depth.
for metric in ['accuracy', 'precision', 'recall', 'f1']:
    fig, ax = plt.subplots()
    ax.plot(metrics_df['max_depth'], metrics_df[metric])
    ax.set_xlabel('max_depth')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} vs. max_depth')
    plt.show()

# Train/test split experiment: train a depth-5 entropy tree on 20%..80% of the
# data and record the accuracy on both the training part and the remainder.
split_results = []
# np.arange with a float step accumulates rounding error (e.g. 0.2 + 0.1 gives
# 0.30000000000000004) and its element count is not guaranteed, so the
# fractions come from linspace with an explicit count, rounded to one decimal.
train_fractions = np.round(np.linspace(0.2, 0.8, 7), 1)
for p in train_fractions:
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=p, random_state=42)
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
    clf.fit(X_tr, y_tr)
    split_results.append({
        'train_pct': p,
        'n_train_samples': len(y_tr),
        'n_test_samples': len(y_te),
        'train_acc': accuracy_score(y_tr, clf.predict(X_tr)),
        'test_acc': accuracy_score(y_te, clf.predict(X_te))
    })

split_df = pd.DataFrame(split_results)
print(split_df)

   train_pct  n_train_samples  n_test_samples  train_acc  test_acc
0        0.2              113             456   1.000000  0.896930
1        0.3              170             399   1.000000  0.909774
2        0.4              227             342   1.000000  0.929825
3        0.5              284             285   0.985915  0.926316
4        0.6              341             228   0.991202  0.960526
5        0.7              398             171   0.992462  0.953216
6        0.8              455             114   0.993407  0.947368

# Two separate figures: training accuracy, then test accuracy, each plotted
# against the training fraction.
for column, y_label, title in (
    ('train_acc', 'Train Accuracy', 'Train Accuracy vs. Training Set Percentage'),
    ('test_acc', 'Test Accuracy', 'Test Accuracy vs. Training Set Percentage'),
):
    plt.figure()
    plt.plot(split_df['train_pct'], split_df[column])
    plt.xlabel('train_pct')
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()

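| max_depth | Accuracy ↓ | Precision ↓ | Recall ↑ | F₁ ↓ |
|-----------|------------|-------------|----------|------|
| 3 | 94.04 % | 91.75 % | 90.82 % | 91.28 % |
| 5 | 92.63 % | 86.67 % | 92.86 % | 89.66 % |
| 7 | 91.93 % | 83.78 % | 94.90 % | 88.99 % |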

Decision Tree Classification on Wisconsin Breast Cancer Data

1. Setup & Data Loading

Data Description

Task Definition & Model Choice

Summary Visualization

Part 1: Decision Tree (Gini)

Tree Visualization (Gini, depth=2)

Discussion of Gini-Based Decision Tree Results

Decision Tree with Entropy

Discussion of Entropy-Based Decision Tree Results

Part 2: Impact of max_depth

Exploratory Analysis of Decision Tree Depth

Interpretation of `max_depth` Results

Recommendations

Part 3: Train/Test Split Experiment

Train/Test Split Analysis

Conclusions

Decision Tree Classification on Wisconsin Breast Cancer Data

1. Setup & Data Loading

Data Description

Task Definition & Model Choice

Summary Visualization

Part 1: Decision Tree (Gini)

Tree Visualization (Gini, depth=2)

Discussion of Gini-Based Decision Tree Results

Decision Tree with Entropy

Discussion of Entropy-Based Decision Tree Results

Part 2: Impact of max_depth

Exploratory Analysis of Decision Tree Depth

Interpretation of max_depth Results

Recommendations

Part 3: Train/Test Split Experiment

Train/Test Split Analysis

Conclusions

Interpretation of `max_depth` Results