Source code for jarvis.ai.pkgs.sklearn.regression

"""
Simple ML models for regression.

Designed for educational purposes only.
"""
from collections import defaultdict
from jarvis.ai.pkgs.utils import regr_scores
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
)
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# Note that these models use their default parameters,
# without any hyper-parameter optimization.
simple_regr_models = [
    GaussianProcessRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    AdaBoostRegressor(),
    SVR(),
    Lasso(),
    LinearRegression(),
    KernelRidge(),
    MLPRegressor(),
    DecisionTreeRegressor(),
]
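
# Illustrative sketch (not part of the original module): ``regression`` below
# also accepts a user-supplied estimator list, e.g. with manually chosen
# hyper-parameters. The name ``tuned_regr_models`` and the parameter values
# are placeholders, not tuned settings.
tuned_regr_models = [
    RandomForestRegressor(n_estimators=200, max_depth=20),
    GradientBoostingRegressor(n_estimators=300, learning_rate=0.05),
    KernelRidge(alpha=0.1, kernel="rbf"),
]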


def regression(
    X=[],
    Y=[],
    plot=False,
    models=simple_regr_models,
    preprocess=True,
    test_size=0.1,
):
    """
    Train and evaluate a collection of regression models on (X, Y).

    Args:
        X: input features

        Y: target data

        models: list of scikit-learn regressor instances

        plot: whether to make a parity plot for each model

        preprocess: whether to apply standard preprocessing
                    (scaling and low-variance feature removal)

        test_size: fraction of the data held out for testing
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, random_state=1, test_size=test_size
    )
    info = defaultdict()
    for i in models:
        pipe = Pipeline(
            [
                ("stdscal", StandardScaler()),
                ("vart", VarianceThreshold(1e-4)),
                ("est", i),
            ]
        )
        if preprocess:
            model = pipe
        else:
            model = i
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        reg_sc = regr_scores(y_test, pred)
        if plot:
            plt.plot(
                reg_sc["pred"],
                reg_sc["test"],
                ".",
                label=str(type(i).__name__)[0:4],
            )
        print(
            type(i).__name__,
            round(reg_sc["mae"], 3),
            round(reg_sc["rmse"], 3),
        )
        info[type(i).__name__] = {}
        info[type(i).__name__]["mae"] = reg_sc["mae"]
        info[type(i).__name__]["rmse"] = reg_sc["rmse"]
    if plot:
        plt.legend()
        plt.xlabel("DFT")
        plt.ylabel("ML")
        plt.show()
    return info
""" if __name__ == "__main__": from jarvis.ai.pkgs.utils import get_ml_data X, Y, jid = get_ml_data(dataset = 'cfid_3d', ml_property='exfoliation_energy') info = regression(X, Y, models = simple_regr_models) print ('info',info) assert info['GradientBoostingRegressor']['mae']<50.0 """