Classification
In [1]:
Copied!
import numpy as np
import pandas as pd
import holmc as hc
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import holmc as hc
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
In [2]:
Copied!
# Load UCI dataset id 73 and build the train/test design matrices.
# The split is performed on the raw (un-encoded) data so that both encoders
# are fitted on the training portion only and merely applied to the test set.
mushroom = fetch_ucirepo(id=73)
X_raw = mushroom.data.features
y_raw = mushroom.data.targets

# Step 1: Convert to pandas DataFrame / Series (targets flattened to 1-D)
X_df = pd.DataFrame(X_raw)
y_series = pd.Series(y_raw.values.ravel(), name="target")

# Step 2: Split the raw data (not encoded yet!) -- 30% held out, stratified
# on the target so both splits keep the same class proportions.
X_train_df, X_test_df, y_train_series, y_test_series = train_test_split(
    X_df, y_series, test_size=0.30, random_state=42, stratify=y_series
)

# Step 3: Encode labels (target values); fitted on the training labels only
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_series)
y_test = label_encoder.transform(y_test_series)

# Step 4: One-hot encode features; categories that appear only in the test
# split are ignored (encoded as all zeros) instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_train_encoded = encoder.fit_transform(X_train_df)
X_test_encoded = encoder.transform(X_test_df)

# Step 5: Add intercept term (column of 1s) as the first column
X_train = np.hstack((np.ones((X_train_encoded.shape[0], 1)), X_train_encoded))
X_test = np.hstack((np.ones((X_test_encoded.shape[0], 1)), X_test_encoded))
In [3]:
Copied!
# Global experiment configuration shared by every grid search and sampler run.
seed = 101            # RNG seed, also passed to the holmc searches/samplers
np.random.seed(seed)  # seed NumPy's legacy global RNG once
N = 150  # number of samples
lamb = 25             # value passed as `lamb` to every run()/sample() call
In [4]:
Copied!
# Candidate hyperparameter values explored by the grid searches.
# `xis` is only passed to the order-3 search; the order-4 search uses
# `gammas` and `etas` alone.
etas = [0.00001, 0.0001, 0.0004, 0.0008, 0.003, 0.006, 0.009, 0.012]
gammas = [1, 5, 10, 20, 40, 80]
xis = [1, 5, 10, 20, 40, 80]
In [5]:
Copied!
# ---------------------------------------------------------------------------
# Order 3: tune (gamma, eta, xi) on the training set, then draw the chain
# ---------------------------------------------------------------------------
search_o3 = hc.GridSearchClassification(
    gammas=gammas, etas=etas, xis=xis, N=N, seed=seed, show_progress=True
)
o3paramspace = search_o3.run(X=X_train, y=y_train, lamb=lamb)

# The first row of the result frame is taken as the best configuration.
# Values are looked up by column label instead of by position so the code
# does not silently pick the wrong column if the frame layout changes
# (the order-3 frame is laid out as gamma, eta, MaxAcc, xi).
o3best_gamma = o3paramspace["gamma"].iloc[0]
o3best_eta = o3paramspace["eta"].iloc[0]
o3best_xi = o3paramspace["xi"].iloc[0]

o3p = hc.O3Params(gamma=o3best_gamma, eta=o3best_eta, xi=o3best_xi)
sampler3 = hc.HoLMCSamplerO3Classification(
    params=o3p, N=N, seed=seed, show_progress=True
)
sample3 = sampler3.sample(X=X_train, y=y_train, lamb=lamb)

# Accuracy is always measured on the held-out test split.
metric = hc.AccuracyMeasure(X=X_test, y=y_test)
accuracies = {}
StDev = {}
accuracies['Order 3'] = metric.compute_accuracy(sample3)
# NOTE(review): this is the spread of the accuracy trace itself; if the trace
# is 1-D the result is a single number -- confirm that is the intended band.
StDev['Order 3'] = np.std(accuracies['Order 3'], axis=0)

# ---------------------------------------------------------------------------
# Order 4: tune (gamma, eta) only, then draw the chain
# ---------------------------------------------------------------------------
search_o4 = hc.GridSearchClassification(
    gammas=gammas, etas=etas, N=N, seed=seed, show_progress=True
)
o4paramspace = search_o4.run(X=X_train, y=y_train, lamb=lamb)
o4best_gamma = o4paramspace["gamma"].iloc[0]
o4best_eta = o4paramspace["eta"].iloc[0]

o4p = hc.O4Params(gamma=o4best_gamma, eta=o4best_eta)
sampler4 = hc.HoLMCSamplerO4Classification(
    params=o4p, N=N, seed=seed, show_progress=True
)
sample4 = sampler4.sample(X=X_train, y=y_train, lamb=lamb)
accuracies['Order 4'] = metric.compute_accuracy(sample4)
StDev['Order 4'] = np.std(accuracies['Order 4'], axis=0)
Eta: 100%|██████████| 8/8 [01:15<00:00, 9.49s/it] 100%|██████████| 150/150 [00:00<00:00, 1162.34it/s] Eta: 100%|██████████| 8/8 [00:30<00:00, 3.76s/it] 100%|██████████| 150/150 [00:00<00:00, 300.10it/s]
In [6]:
Copied!
# Plotting: test accuracy per iteration for the tuned order-3 and order-4
# samplers, each with a shaded band of +/- 0.5 * StDev around the curve.
figure, ax = plt.subplots(figsize=(8.8, 6.6))

label_str3 = (
    "Order 3\n"
    f"$\\gamma$ = {o3best_gamma}\n"
    f"$\\eta$ = {o3best_eta}\n"
    f"$\\xi$ = {o3best_xi}"
)
label_str4 = (
    "Order 4\n"
    f"$\\gamma$ = {o4best_gamma}\n"
    f"$\\eta$ = {o4best_eta}\n"
)

for name, legend_label, color in (
    ('Order 3', label_str3, 'blue'),
    ('Order 4', label_str4, 'green'),
):
    acc = np.asarray(accuracies[name])
    # Derive the x-axis from the trace itself rather than hard-coding N + 1,
    # so a trace of a different length cannot cause a shape mismatch.
    index = np.arange(acc.shape[0])
    half_band = 0.5 * StDev[name]
    ax.plot(index, acc, label=legend_label, color=color, linewidth=3)
    ax.fill_between(index, acc - half_band, acc + half_band,
                    color=color, alpha=0.2)

ax.set_xlabel("Iteration", fontsize=30)
ax.set_ylabel("Accuracy", fontsize=30)
ax.legend(fontsize=30)
ax.tick_params(axis='both', which='major', labelsize=22)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Times New Roman')

# Fix the axis limits before tight_layout so the layout is computed for the
# tick labels that are actually drawn.
ax.set_ylim(0.45, 1)
figure.tight_layout()
# The target directory is not created here; it must already exist.
figure.savefig('../images/o3o4class.png')
plt.show()
In [7]:
Copied!
# Sensitivity of the order-3 sampler to gamma: eta and xi are held at the
# values selected by the grid search while gamma is varied.
gamma_vals = [1, 5, 10, 20]
eta = o3best_eta
xi = o3best_xi

AccForGammaVar_o3 = {}
for gamma in gamma_vals:
    # Parameters are built inline so the best-fit `o3p` object created by the
    # grid-search cell is not overwritten by the sweep.
    sampler = hc.HoLMCSamplerO3Classification(
        params=hc.O3Params(gamma=gamma, eta=eta, xi=xi),
        N=N, seed=seed, show_progress=False
    )
    sample = sampler.sample(X=X_train, y=y_train, lamb=lamb)
    AccForGammaVar_o3[gamma] = metric.compute_accuracy(sample)

# One column per gamma value, one row per recorded iteration.
AccForGammaVar_o3df = pd.DataFrame(AccForGammaVar_o3)

figure, ax = plt.subplots(figsize=(8.8, 6.6))
for gamma in gamma_vals:
    # Columns are keyed by gamma, so they can be addressed directly.
    ax.plot(
        AccForGammaVar_o3df.index,
        AccForGammaVar_o3df[gamma],
        label=f"$\\gamma = {gamma}$",
        linewidth=3
    )
ax.set_xlabel("Iteration", fontsize=30)
ax.set_ylabel("Accuracy", fontsize=30)
ax.legend(fontsize=14, loc="right")
ax.tick_params(axis='both', which='major', labelsize=22)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Times New Roman')

# Set the limits first so tight_layout sees the final tick labels.
ax.set_ylim(0.9, 1)
figure.tight_layout()
# The target directory is not created here; it must already exist.
figure.savefig('../images/o3class_gamma_var.png')
plt.show()
In [8]:
Copied!
# Sensitivity of the order-4 sampler to gamma: eta is held at the value
# selected by the grid search while gamma is varied.
gamma_vals = [1, 5, 10, 20]
eta = o4best_eta

AccForGammaVar_o4 = {}
for gamma in gamma_vals:
    # Parameters are built inline so the best-fit `o4p` object created by the
    # grid-search cell is not overwritten by the sweep.
    sampler = hc.HoLMCSamplerO4Classification(
        params=hc.O4Params(gamma=gamma, eta=eta),
        N=N, seed=seed, show_progress=False
    )
    sample = sampler.sample(X=X_train, y=y_train, lamb=lamb)
    AccForGammaVar_o4[gamma] = metric.compute_accuracy(sample)

# One column per gamma value, one row per recorded iteration.
AccForGammaVar_o4df = pd.DataFrame(AccForGammaVar_o4)

figure, ax = plt.subplots(figsize=(8.8, 6.6))
for gamma in gamma_vals:
    # Columns are keyed by gamma, so they can be addressed directly.
    ax.plot(
        AccForGammaVar_o4df.index,
        AccForGammaVar_o4df[gamma],
        label=f"$\\gamma = {gamma}$",
        linewidth=3
    )
ax.set_xlabel("Iteration", fontsize=30)
ax.set_ylabel("Accuracy", fontsize=30)
ax.legend(fontsize=14, loc="right")
ax.tick_params(axis='both', which='major', labelsize=22)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Times New Roman')

# Set the limits first so tight_layout sees the final tick labels.
ax.set_ylim(0.9, 1)
figure.tight_layout()
# The target directory is not created here; it must already exist.
figure.savefig('../images/o4class_gamma_var.png')
plt.show()
In [9]:
Copied!
# Display the order-3 grid-search results: one row per (gamma, eta, xi)
# combination together with its MaxAcc. The first row is the configuration
# that the grid-search cell picks as the best one.
o3paramspace
o3paramspace
Out[9]:
gamma | eta | MaxAcc | xi | |
---|---|---|---|---|
190 | 5 | 0.00600 | 0.9942 | 40 |
220 | 1 | 0.00900 | 0.9940 | 40 |
181 | 1 | 0.00600 | 0.9940 | 5 |
191 | 5 | 0.00600 | 0.9940 | 80 |
221 | 1 | 0.00900 | 0.9940 | 80 |
... | ... | ... | ... | ... |
19 | 20 | 0.00001 | 0.4736 | 5 |
18 | 20 | 0.00001 | 0.4736 | 1 |
17 | 10 | 0.00001 | 0.4736 | 80 |
16 | 10 | 0.00001 | 0.4736 | 40 |
0 | 1 | 0.00001 | 0.4736 | 1 |
288 rows × 4 columns
In [10]:
Copied!
# Display the order-4 grid-search results: one row per (gamma, eta)
# combination together with its MaxAcc. The first row is the configuration
# that the grid-search cell picks as the best one.
o4paramspace
o4paramspace
Out[10]:
gamma | eta | MaxAcc | |
---|---|---|---|
42 | 1 | 0.01200 | 0.9947 |
36 | 1 | 0.00900 | 0.9945 |
30 | 1 | 0.00600 | 0.9940 |
43 | 5 | 0.01200 | 0.9930 |
37 | 5 | 0.00900 | 0.9923 |
31 | 5 | 0.00600 | 0.9877 |
44 | 10 | 0.01200 | 0.9875 |
24 | 1 | 0.00300 | 0.9873 |
38 | 10 | 0.00900 | 0.9847 |
45 | 20 | 0.01200 | 0.9843 |
25 | 5 | 0.00300 | 0.9833 |
39 | 20 | 0.00900 | 0.9807 |
32 | 10 | 0.00600 | 0.9805 |
46 | 40 | 0.01200 | 0.9757 |
33 | 20 | 0.00600 | 0.9727 |
26 | 10 | 0.00300 | 0.9703 |
40 | 40 | 0.00900 | 0.9696 |
47 | 80 | 0.01200 | 0.9627 |
34 | 40 | 0.00600 | 0.9587 |
27 | 20 | 0.00300 | 0.9567 |
41 | 80 | 0.00900 | 0.9501 |
35 | 80 | 0.00600 | 0.9305 |
28 | 40 | 0.00300 | 0.9288 |
18 | 1 | 0.00080 | 0.9038 |
19 | 5 | 0.00080 | 0.9020 |
20 | 10 | 0.00080 | 0.8955 |
29 | 80 | 0.00300 | 0.8801 |
21 | 20 | 0.00080 | 0.8718 |
22 | 40 | 0.00080 | 0.7891 |
12 | 1 | 0.00040 | 0.7346 |
13 | 5 | 0.00040 | 0.7334 |
14 | 10 | 0.00040 | 0.7297 |
15 | 20 | 0.00040 | 0.7165 |
23 | 80 | 0.00080 | 0.6950 |
16 | 40 | 0.00040 | 0.6658 |
17 | 80 | 0.00040 | 0.5990 |
6 | 1 | 0.00010 | 0.4921 |
7 | 5 | 0.00010 | 0.4921 |
8 | 10 | 0.00010 | 0.4919 |
9 | 20 | 0.00010 | 0.4919 |
10 | 40 | 0.00010 | 0.4917 |
11 | 80 | 0.00010 | 0.4907 |
1 | 5 | 0.00001 | 0.4757 |
5 | 80 | 0.00001 | 0.4757 |
4 | 40 | 0.00001 | 0.4757 |
3 | 20 | 0.00001 | 0.4757 |
2 | 10 | 0.00001 | 0.4757 |
0 | 1 | 0.00001 | 0.4757 |
In [ ]:
Copied!