Source code for jarvis.ai.pkgs.lgbm.regression

"""Modules for LightGBM regression."""

from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from jarvis.ai.pkgs.utils import regr_scores
from collections import defaultdict
import numpy as np
import pickle
import joblib
import matplotlib.pyplot as plt
import scipy as sp


def regression(
    X=[],
    Y=[],
    jid=[],
    test_size=0.1,
    plot=False,
    preprocess=True,
    feature_importance=True,
    save_model=False,
    feat_names=[],
    model_name="my_model",
    config={},
):
    """Get generic regression model."""
    lgbm = lgb.LGBMRegressor(
        n_estimators=config["n_estimators"],
        learning_rate=config["learning_rate"],
        num_leaves=config["num_leaves"],
    )
    info = defaultdict()
    X_train, X_test, y_train, y_test, jid_train, jid_test = train_test_split(
        X, Y, jid, random_state=1, test_size=test_size
    )
    pipe = Pipeline(
        [
            ("stdscal", StandardScaler()),
            ("vart", VarianceThreshold(1e-4)),
            ("est", lgbm),
        ]
    )
    if preprocess:
        model = pipe
    else:
        model = lgbm
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    reg_sc = regr_scores(y_test, pred)
    info["reg_scores"] = reg_sc
    if feature_importance:
        imp_data = []
        info["imp_data"] = imp_data
        if not preprocess:
            feat_imp = model.feature_importances_
            feat_imp = 100 * np.array(
                [float(i) / float(np.sum(feat_imp)) for i in feat_imp]
            )
            for f in range(len(feat_imp)):
                imp_data.append([feat_imp[f], feat_names[f]])
        else:
            feat_imp = model.named_steps["est"].feature_importances_
            feat_imp = 100 * np.array(
                [float(i) / float(np.sum(feat_imp)) for i in feat_imp]
            )
            keep_indices = model.named_steps["vart"].get_support(indices=True)
            indices = np.argsort(feat_imp)[::-1]
            new_feat_imp = feat_imp[indices]
            new_indices = keep_indices[indices]
            for f in range(len(new_feat_imp)):
                imp_data.append([new_feat_imp[f], feat_names[new_indices[f]]])
    print(
        model_name,
        round(reg_sc["mae"], 3),
        round(reg_sc["rmse"], 3),
        round(reg_sc["r2"], 3),
    )
    if plot:
        plt.plot(
            reg_sc["pred"],
            reg_sc["test"],
            ".",
            label=str(type(model).__name__)[0:4],
        )
        plt.legend()
        plt.xlabel("DFT")
        plt.ylabel("ML")
    if save_model:
        pk = str(model_name) + str(".pk")
        jb = str(model_name) + str(".jb")
        # js = str(model_name) + str('.js')
        pickle.dump(model, open(pk, "wb"))
        joblib.dump(model, jb)
        # TODO: implement something like sklearn-json
        # json.dump(model.get_params(), open(js, "w"))
    return info
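
# Illustrative usage sketch (not part of the original module): how
# ``regression`` above might be called on synthetic data.  The random
# arrays, feature names, and config values are assumptions for
# demonstration only; real inputs come from ``get_ml_data`` as in the
# commented example at the end of this file.
def _example_regression_usage():
    """Illustrative only: fit ``regression`` on random synthetic data."""
    rng = np.random.RandomState(0)
    X = rng.rand(100, 10)
    Y = rng.rand(100)
    jid = np.arange(100)
    names = ["feat_" + str(i) for i in range(X.shape[1])]
    config = {"n_estimators": 50, "learning_rate": 0.1, "num_leaves": 31}
    return regression(
        X=X, Y=Y, jid=jid, config=config, feat_names=names, plot=False
    )
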
default_param_dist = {
    # 'boosting_type': ['dart'],
    # 'boosting_type': ['gbdt', 'dart', 'rf'],
    # 'num_leaves': sp.stats.randint(2, 1001),
    # 'subsample_for_bin': sp.stats.randint(10, 1001),
    # 'min_split_gain': sp.stats.uniform(0, 5.0),
    # 'min_child_weight': sp.stats.uniform(1e-6, 1e-2),
    # 'reg_alpha': sp.stats.uniform(0, 1e-2),
    # 'reg_lambda': sp.stats.uniform(0, 1e-2),
    # 'tree_learner': ['data', 'feature', 'serial', 'voting'],
    # 'application': ['regression_l1', 'regression_l2', 'regression'],
    # 'bagging_freq': sp.stats.randint(1, 11),
    # 'bagging_fraction': sp.stats.uniform(.1, 0.9),
    # 'feature_fraction': sp.stats.uniform(.1, 0.9),
    # 'learning_rate': sp.stats.uniform(1e-3, 0.9),
    # 'est__num_leaves': [2, 8, 16],
    # 'est__min_data_in_leaf': [1, 2, 4],
    # 'est__learning_rate': [0.005, 0.01, 0.1],
    # 'est__max_depth': [1, 3, 5],  # sp.stats.randint(1, 501),
    # 'est__n_estimators': [num_iteration, 2 * num_iteration, 5 * num_iteration],
    # sp.stats.randint(100, 20001),
    # 'gpu_use_dp': [True, False],
    "est__min_data_in_leaf": sp.stats.randint(5, 20),
    "est__n_estimators": sp.stats.randint(500, 2000),
    "est__num_leaves": sp.stats.randint(100, 500),
    "est__max_depth": sp.stats.randint(8, 20),
    "est__learning_rate": sp.stats.uniform(5e-3, 0.5),
}
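
# Note (illustrative, not part of the original module): the ``est__`` prefix
# in ``default_param_dist`` follows scikit-learn's ``<step>__<parameter>``
# convention and routes each distribution to the "est" (LGBMRegressor) step
# of the Pipeline built in ``get_lgbm`` below.  A minimal sketch of deriving
# a custom search space; the narrower learning-rate range is an assumption.
def _example_custom_param_dist():
    """Illustrative only: override one entry of the default search space."""
    custom_dist = dict(default_param_dist)
    custom_dist["est__learning_rate"] = sp.stats.uniform(1e-3, 0.1)
    return custom_dist
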
def get_lgbm(
    train_x,
    val_x,
    train_y,
    val_y,
    cv,
    n_jobs,
    scoring,
    n_iter,
    objective,
    alpha,
    random_state,
    param_dist=default_param_dist,
):
    """
    Train a lightgbm model.

    Args:
        train_x: samples used for training
        val_x: validation set
        train_y: train targets
        val_y: validation targets
        cv: # of cross-validations
        n_jobs: for making the job parallel
        scoring: scoring function to use such as MAE

    Returns:
        Best estimator.
    """
    # Get converged boosting iterations with a high learning rate,
    # using MAE as the convergence criterion.
    lgbm = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=5,
        num_leaves=100,
        objective=objective,
        # min_data_in_leaf=2,
        n_jobs=-1,
        alpha=alpha,
        random_state=random_state,
        verbose=-1,
    )
    lgbm.fit(
        train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        eval_metric="mae",
        # eval_metric='l1',
        early_stopping_rounds=10,
    )
    num_iteration = lgbm.best_iteration_
    print("num_iteration", num_iteration)
    print("in randomsearch cv")
    # Generally thousands of randomized-search iterations are needed for
    # optimal parameters; learning_rate and num_leaves are very important.
    lgbm = lgb.LGBMRegressor(
        objective=objective,
        # device='gpu',
        # n_estimators=num_iteration,
        n_jobs=n_jobs,
        alpha=alpha,
        verbose=-1,
    )
    pipe = Pipeline(
        [
            ("stdscal", StandardScaler()),
            ("vart", VarianceThreshold(1e-4)),
            ("est", lgbm),
        ]
    )
    # n_iter=10
    # Increase n_iter for production runs.
    rscv = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        cv=cv,
        scoring=scoring,
        n_iter=n_iter,
        n_jobs=n_jobs,
        verbose=-1,
        random_state=random_state,
        refit=True,
    )
    model = rscv.fit(train_x, train_y)
    print("Best Score: ", model.best_score_)
    print("Best Params: ", model.best_params_)
    print("Best Estimator: ", model.best_estimator_)
    # return model.best_estimator_
    return model
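
# Illustrative usage sketch (not part of the original module): one way to
# drive ``get_lgbm`` with a random train/validation split.  The synthetic
# data, the scoring string, and the search settings are assumptions for
# demonstration only.
def _example_get_lgbm_usage():
    """Illustrative only: run the randomized search on synthetic data."""
    rng = np.random.RandomState(0)
    X = rng.rand(200, 10)
    y = rng.rand(200)
    train_x, val_x, train_y, val_y = train_test_split(
        X, y, test_size=0.1, random_state=0
    )
    return get_lgbm(
        train_x,
        val_x,
        train_y,
        val_y,
        cv=3,
        n_jobs=1,
        scoring="neg_mean_absolute_error",
        n_iter=5,
        objective="regression",
        alpha=0.5,
        random_state=0,
    )
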
def parameters_dict():
    """Give example optimized parameters."""
    parameters = {
        "optb88vdw_bandgap": {
            "n_estimators": 324,
            "learning_rate": 0.06414333047469417,
            "num_leaves": 31,
        },
        "mbj_bandgap": {
            "n_estimators": 210,
            "learning_rate": 0.04727272041771037,
            "num_leaves": 121,
        },
        "epsx": {
            "n_estimators": 139,
            "learning_rate": 0.10098329400041395,
            "num_leaves": 527,
        },
        "epsy": {
            "n_estimators": 161,
            "learning_rate": 0.264679564828344,
            "num_leaves": 29,
        },
        "epsz": {
            "n_estimators": 161,
            "learning_rate": 0.264679564828344,
            "num_leaves": 29,
        },
        "mepsx": {
            "n_estimators": 75,
            "learning_rate": 0.05374708509141705,
            "num_leaves": 242,
        },
        "mepsy": {
            "n_estimators": 120,
            "learning_rate": 0.12048289662270327,
            "num_leaves": 398,
        },
        "mepsz": {
            "n_estimators": 89,
            "learning_rate": 0.09718152788954888,
            "num_leaves": 938,
        },
        "encut": {
            "n_estimators": 376,
            "learning_rate": 0.08982089572506267,
            "num_leaves": 762,
        },
        "kpoint_length_unit": {
            "n_estimators": 236,
            "learning_rate": 0.03234907667844313,
            "num_leaves": 794,
        },
        "bulk_modulus_kv": {
            "n_estimators": 380,
            "learning_rate": 0.08621497083536021,
            "num_leaves": 202,
        },
        "shear_modulus_gv": {
            "n_estimators": 284,
            "learning_rate": 0.017555838240950795,
            "num_leaves": 579,
        },
        "formation_energy_peratom": {
            "n_estimators": 1170,
            "learning_rate": 0.15375236057119931,
            "num_leaves": 273,
        },
        "exfoliation_energy": {
            "n_estimators": 47,
            "learning_rate": 0.0734095239247927,
            "num_leaves": 326,
        },
        "max_ir_mode": {
            "n_estimators": 500,
            "learning_rate": 0.0734095239247927,
            "num_leaves": 100,
        },
    }
    return parameters
""" if __name__ == "__main__": from jarvis.ai.pkgs.utils import get_ml_data from jarvis.ai.descriptors.cfid import feat_names property = "exfoliation_energy" property='formation_energy_peratom' params = parameters_dict()[property] print(params) X, Y, jid = get_ml_data(dataset="cfid_3d", ml_property=property) names = feat_names() info = regression(X=X, Y=Y, jid = jid, config=params, feat_names=names) print(info) """