I am building a random forest via XGBoost, following the instructions at https://xgboost.readthedocs.io/en/latest/tutorials/rf.html; moreover, FLAML (GitHub - microsoft/FLAML: A fast library for AutoML and tuning. Join our Discord: https://discord.gg/Cppx2vSPVP.) is used for hyperparameter optimization. The Python code is shown below:
!pip install flaml;
from flaml.data import load_openml_dataset
from xgboost import XGBClassifier
from flaml.model import BaseEstimator
# Download the Airlines dataset (OpenML id 1169) and split it into
# train/test partitions; files are cached under ./ on repeated runs.
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')
from flaml import AutoML
from flaml import tune
automl = AutoML()
# Number of CPU cores XGBoost should use (passed through as n_jobs).
num_cores = 16
# Fixed seed so tuning runs are reproducible.
randomseed = 19850606
class MyMonotonicRandomForestClassifier(BaseEstimator):
    """Custom FLAML learner: a random forest built with XGBoost.

    A random forest is obtained from XGBoost by growing many trees in a
    single boosting round (n_estimators=1, num_parallel_tree >> 1) with a
    learning rate of 1, per the XGBoost random-forest tutorial.
    """

    def __init__(self, task='binary:logistic', n_jobs=num_cores, **params):
        """Configure the underlying XGBClassifier.

        Parameters
        ----------
        task : str
            Objective passed to the FLAML base estimator.
        n_jobs : int
            Number of threads for XGBoost (default: module-level num_cores).
        **params : dict
            Hyperparameters sampled by the tuner from search_space().
        """
        super().__init__(task, **params)
        self.estimator_class = XGBClassifier
        # Use .get() with the search-space init values as defaults so that
        # instantiating the learner without a full hyperparameter dict no
        # longer raises KeyError. Integer hyperparameters are sampled as
        # floats by the tuner, so cast them back to int here.
        self.params = {
            # Bug fix: the explicit n_jobs argument was previously ignored
            # (only params['n_jobs'] was consulted); honor it as the fallback.
            'n_jobs': params.get('n_jobs', n_jobs),
            'booster': params.get('booster', 'gbtree'),
            # learning_rate must be 1 for a pure random forest.
            'learning_rate': params.get('learning_rate', 1),
            'gamma': params.get('gamma', 0.00001),
            'reg_lambda': params.get('reg_lambda', 1),
            'reg_alpha': params.get('reg_alpha', 0.000000000001),
            'max_depth': int(params.get('max_depth', 30)),
            'min_child_weight': int(params.get('min_child_weight', 100)),
            'subsample': params.get('subsample', 0.67),
            'colsample_bylevel': params.get('colsample_bylevel', 0.9),
            # Many parallel trees + one boosting round = random forest.
            'num_parallel_tree': int(params.get('num_parallel_tree', 10000)),
            'n_estimators': params.get('n_estimators', 1),
            'random_state': params.get('random_state', randomseed),
        }

    @classmethod
    def search_space(cls, data_size, task):
        """Return the FLAML search space for this learner.

        Each entry maps a hyperparameter name to its sampling domain and
        initial value; integer-valued parameters are sampled as floats and
        cast to int in __init__.
        """
        space = {
            'max_depth': {'domain': tune.uniform(lower=15, upper=100), 'init_value': 30},
            'num_parallel_tree': {'domain': tune.uniform(lower=5000, upper=20000), 'init_value': 10000},
            'min_child_weight': {'domain': tune.uniform(lower=1, upper=1000), 'init_value': 100},
            'subsample': {'domain': tune.uniform(lower=0.5, upper=1), 'init_value': 0.67},
            'colsample_bylevel': {'domain': tune.uniform(lower=0.7, upper=1), 'init_value': 0.9},
            'gamma': {'domain': tune.loguniform(lower=0.000000000001, upper=0.001), 'init_value': 0.00001},
            'reg_lambda': {'domain': tune.loguniform(lower=0.000000000001, upper=1), 'init_value': 1},
            'reg_alpha': {'domain': tune.loguniform(lower=0.000000000001, upper=1), 'init_value': 0.000000000001},
        }
        return space
# Register the custom learner under the name referenced in estimator_list.
automl.add_learner(
    learner_name='MonotonicRandomForest',
    learner_class=MyMonotonicRandomForestClassifier,
)

# Configuration for the AutoML search.
settings = dict(
    time_budget=300,                            # total tuning time in seconds
    metric='roc_auc',                           # optimization metric
    estimator_list=['MonotonicRandomForest'],   # restrict search to the custom learner
    task='classification',                      # task type
    log_file_name='airlines_experiment_custom.log',  # flaml log file
    log_training_metric=True,                   # also log the training metric
)

# Run the search, validating on the held-out test split.
automl.fit(
    X_train=X_train,
    y_train=y_train,
    X_val=X_test,
    y_val=y_test,
    **settings,
)
When running the code above in Jupyter on a Linux server running CentOS, I can see from the backend that 16 CPUs are activated and used for the model (num_cores = 16, as set).
However, when I run the same code in RStudio via reticulate, only one CPU is active no matter how I set num_cores, indicating that the process has become single-threaded.
In addition, for a 'traditional' xgboost run with multiple boosting rounds and num_parallel_tree = 1, there is no such issue in reticulate. The equivalent R code also runs on multiple cores in RStudio without problems.
Any ideas? Once the process becomes single-threaded it takes a very long time to finish, which is very unfavorable for my work.
Your help will be highly appreciated.