Code 1: effect of bootstrap sample size (max_samples) on random forest accuracy
(.env) [boris@fedora34server THREAD]$ cat randomForrestHyperv.py
# explore random forest bootstrap sample size on performance
from numpy import mean
from numpy import std
from numpy import arange
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot
# get the dataset
def get_dataset():
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)
    return X, y
# get a list of models to evaluate
def get_models():
    models = dict()
    # explore ratios from 10% to 100% in 10% increments
    for i in arange(0.1, 1.1, 0.1):
        key = '%.1f' % i
        # set max_samples=None to use 100%
        if i == 1.0:
            i = None
        models[key] = RandomForestClassifier(max_samples=i)
    return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    # define the evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate the model and collect the results
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=4)
    return scores
# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    # evaluate the model
    scores = evaluate_model(model, X, y)
    # store the results
    results.append(scores)
    names.append(name)
    # summarize the performance along the way
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
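The loop above sweeps max_samples by hand; the same experiment can also be phrased as a grid search, letting scikit-learn cross-validate every candidate and report the winner. A minimal sketch, assuming the same dataset and CV settings (the grid values below are my choice, not part of the original script):

# tune max_samples with GridSearchCV instead of a manual loop
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_redundant=5, random_state=3)
# None means "use the full training set per tree", matching the i == 1.0 case above
grid = {'max_samples': [0.1, 0.3, 0.5, 0.7, 0.9, None]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
search = GridSearchCV(RandomForestClassifier(), grid, scoring='accuracy', cv=cv, n_jobs=4)
search.fit(X, y)
print(search.best_params_, '%.3f' % search.best_score_)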
Code 2: effect of the number of features considered per split (max_features) on performance
# explore random forest number of features effect on performance
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot
# get the dataset
def get_dataset():
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)
    return X, y
# get a list of models to evaluate
def get_models():
    models = dict()
    # explore number of features from 1 to 7
    for i in range(1, 8):
        models[str(i)] = RandomForestClassifier(max_features=i)
    return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    # define the evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate the model and collect the results
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=4)
    return scores
# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    # evaluate the model
    scores = evaluate_model(model, X, y)
    # store the results
    results.append(scores)
    names.append(name)
    # summarize the performance along the way
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
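Whichever max_features value wins, the fitted forest can also report how much each input feature actually contributed, a useful sanity check on a dataset built with 15 informative columns. A short sketch (the max_features=4 setting is an illustrative assumption):

# inspect impurity-based feature importances of one fitted model
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_redundant=5, random_state=3)
model = RandomForestClassifier(max_features=4)  # assumed setting, for illustration
model.fit(X, y)
# one importance per feature; the values are normalized to sum to 1
for idx, imp in enumerate(model.feature_importances_):
    print('feature %d: %.3f' % (idx, imp))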
Code 3: RFE feature selection with a random forest estimator inside a pipeline
(.env) [boris@fedora34server THREAD]$ cat jobsMulti.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
# synthetic binary problem: 3200 features, only 100 of them informative
X, y = make_classification(n_samples=1000, n_features=3200, n_informative=100,
                           n_redundant=3100, n_classes=2, n_clusters_per_class=30)
# scale, then recursively eliminate 1% of the features per step down to 10
pipe = Pipeline([
    ('slr', StandardScaler()),
    # max_features='sqrt' is the former 'auto' default for classifiers
    ('fs', RFE(RandomForestClassifier(n_estimators=1000, max_features='sqrt',
                                      class_weight='balanced', n_jobs=4),
               step=0.01, n_features_to_select=10))
])
pipe.fit(X, y)
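Once the pipeline is fitted, the RFE step records which of the 3200 columns survived. A minimal sketch of pulling them out, using the 'fs' step name from the pipeline above (the numpy import and variable names are mine):

# list the feature indices kept by the fitted RFE step
import numpy as np
fs = pipe.named_steps['fs']
print('selected feature indices:', np.where(fs.support_)[0])
# ranking_ is 1 for selected features; higher numbers were eliminated earlier
print('rankings of first 20 features:', fs.ranking_[:20])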