Wednesday, March 23, 2022

Random Forest Hyperparameters Python

Code 1

 (.env) [boris@fedora34server THREAD]$ cat randomForrestHyperv.py

# explore the effect of the random forest bootstrap sample size on performance

from numpy import mean

from numpy import std

from numpy import arange

from sklearn.datasets import make_classification

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.ensemble import RandomForestClassifier

from matplotlib import pyplot

# get the dataset

def get_dataset():
    """Generate the synthetic classification problem used by this script.

    Returns:
        The (X, y) pair produced by make_classification for 1000 samples
        and 20 features (15 informative, 5 redundant), seeded with
        random_state=3.
    """
    return make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        random_state=3,
    )

# get a list of models to evaluate

def get_models():
    """Build one random forest per bootstrap sample ratio.

    Returns:
        dict mapping the ratio formatted with '%.1f' ('0.1' through '1.0')
        to a RandomForestClassifier configured with that max_samples value.
    """
    models = dict()
    # explore ratios from 10% to 100% in 10% increments; iterate over integer
    # tenths so the 100% case is detected exactly instead of relying on a
    # floating-point equality test against values built from 0.1 steps
    for tenths in range(1, 11):
        ratio = tenths / 10
        key = '%.1f' % ratio
        # set max_samples=None to use 100%
        max_samples = None if tenths == 10 else ratio
        models[key] = RandomForestClassifier(max_samples=max_samples)
    return models

 

# evaluate a given model using cross-validation

def evaluate_model(model, X, y):
    """Score a model with repeated stratified k-fold cross-validation.

    Args:
        model: estimator handed to cross_val_score.
        X: feature data.
        y: target labels.

    Returns:
        The accuracy scores returned by cross_val_score for 10 splits
        repeated 3 times (splitter seeded with random_state=1).
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        # evaluation procedure: 10 stratified folds, repeated 3 times
        cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1),
        n_jobs=4,
    )

 

# define dataset

# build the dataset and the candidate models
X, y = get_dataset()
models = get_models()

# score every candidate, keeping labels and scores in matching order
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    # report the mean and standard deviation as soon as each model finishes
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# box plot of the score distributions, one box per model
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show(block=False)






























Code 2

# explore the effect of the random forest number of features on performance
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot
 
# get the dataset
def get_dataset():
    """Generate the synthetic classification problem used by this script.

    Returns:
        The (X, y) pair produced by make_classification for 1000 samples
        and 20 features (15 informative, 5 redundant), seeded with
        random_state=3.
    """
    return make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        random_state=3,
    )
 
# get a list of models to evaluate
def get_models():
    """Build one random forest per candidate max_features value.

    Returns:
        dict mapping str(n) to RandomForestClassifier(max_features=n)
        for n from 1 to 7.
    """
    # explore number of features from 1 to 7
    return {
        str(n_features): RandomForestClassifier(max_features=n_features)
        for n_features in range(1, 8)
    }
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score a model with repeated stratified k-fold cross-validation.

    Args:
        model: estimator handed to cross_val_score.
        X: feature data.
        y: target labels.

    Returns:
        The accuracy scores returned by cross_val_score for 10 splits
        repeated 3 times (splitter seeded with random_state=1).
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        # evaluation procedure: 10 stratified folds, repeated 3 times
        cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1),
        n_jobs=4,
    )
 
# define dataset
# build the dataset and the candidate models
X, y = get_dataset()
models = get_models()

# score every candidate, keeping labels and scores in matching order
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    # report the mean and standard deviation as soon as each model finishes
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# box plot of the score distributions, one box per model
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show(block=False)


















Code 3

# (.env) [boris@fedora34server THREAD]$ cat jobsMulti.py
# Fit a scaling + recursive-feature-elimination pipeline, driven by a
# random forest running with n_jobs=4, on a wide synthetic dataset.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

# 1000 samples x 3200 features: 100 informative, 3100 redundant
X, y = make_classification(
    n_samples=1000,
    n_features=3200,
    n_informative=100,
    n_redundant=3100,
    n_classes=2,
    n_clusters_per_class=30,
)

pipe = Pipeline([
    ('slr', StandardScaler()),
    ('fs', RFE(
        RandomForestClassifier(
            n_estimators=1000,
            # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
            # 'sqrt' is the value 'auto' stood for on classifiers
            max_features='sqrt',
            class_weight='balanced',
            n_jobs=4,
        ),
        step=0.01,
        n_features_to_select=10,
    )),
])
pipe.fit(X, y)
























References




No comments:

Post a Comment