Source code for jarvis.ai.uncertainty.gaussian_process_uncertainty

"""
Code to predict properties and their uncertainty.

ML model used: lgbm
"""
# Ref: https://doi.org/10.1021/acsomega.1c03752
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from jarvis.ai.pkgs.utils import regr_scores
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.gaussian_process.kernels import RationalQuadratic

# from sklearn.gaussian_process.kernels import ExpSineSquared
# from sklearn.gaussian_process.kernels import ConstantKernel as C
# import joblib
from joblib import dump
from collections import OrderedDict


[docs]def GaussianProcesses( x, y, jid, cv=2, n_jobs=-1, n_iter=10, random_state=508842607, scoring="neg_mean_absolute_error", prop="exfoliation_energy", write_files=True, ): """ Perform Gaussian Processes and determine prediction intervals. NOTE: it can deal with ~50 descriptors AT MAX ==> reduction of the CFID descriptors is needed """ # TODO: Make writing file in proper python format # STEP-2: Splitting the data # *************************** # 90-10% split for train test X_train, X_test, y_train, y_test, jid_train, jid_test = train_test_split( x, y, jid, random_state=1, test_size=0.1 ) # print ('lenx len y',len(x[0]),len(y)) # STEP-3: Use a specific ML modeli: GP in this case # ************************************************* # # Model definition # ================ # The Kernel can be chanded depending on the property under examination. # In this example we use a composite kernel that includes long range # effects (RBF), medium term irregularities (RationalQuadratic), and # white noise # Periodic trends can be added using ExpSineSquared # Note: RBF has one parameter for each descriptor used ==> if using a # different number of descriptors, the number of parameters has # to be changed RQ1 = 316**2 * RationalQuadratic( alpha=0.067, length_scale=43.5, length_scale_bounds=(1e0, 1e2), alpha_bounds=(1e-3, 1e1), ) RQ2 = 316**2 * RationalQuadratic( alpha=0.00473, length_scale=0.0796, length_scale_bounds=(1e-2, 1e0), alpha_bounds=(1e-3, 1e1), ) RBF1 = 316**2 * RBF( length_scale=[ 16, 1.48e03, 9.92e04, 0.858, 1.02, 2.58, 2.72e03, 2.22e03, 0.839, 0.5, 6.15, 130, 3.9e03, 3.1, 0.675, 6.04, 1.18e03, 2.9, 4.56e03, 333, 0.0167, 797, 2.37e03, 1.81e03, ] ) White = WhiteKernel(noise_level=0.7, noise_level_bounds=(0.05, 0.8)) kernel = RQ1 + RQ2 + RBF1 + White mid_model = GaussianProcessRegressor( kernel=kernel, alpha=0.0001, n_restarts_optimizer=3, normalize_y=True, random_state=int(774057201), ) # Model fitting # ============= scaler = StandardScaler().fit(X_train) scaler.transform(X_train) scaler.transform(X_test) mid_model.fit(X_train, y_train) dump(mid_model, "model.joblib") print("") print("Params:") print(mid_model.get_params(deep=True)) print("Initial: ", kernel) print("Optimum: ", mid_model.kernel_) print( "Log-Marginal-Likelihood: ", mid_model.log_marginal_likelihood(mid_model.kernel_.theta), ) print("") print("") # PREDICTIONS and UQ # ================== pred, sigma = mid_model.predict(X_test, return_std=True) print("Model mae rmse") reg_sc = regr_scores(y_test, pred) info = OrderedDict() info["MAE_Mid"] = reg_sc["mae"] actual = y_test print("Mid:", round(reg_sc["mae"], 3), round(reg_sc["rmse"], 3)) fout2 = open("Intervals1.dat", "w") line0 = "# Jid Observed pred_Lower pred_Mid " line1 = line0 + "RealErr(Mid) " line = line1 + "pred_Upper PredError Pred_inBounds\n" fout2.write(line) sum = 0.0 count = 0 MAE_err = 0.0 for ii in range(len(y_test)): ss = float(sigma[ii]) pp = float(pred[ii]) sum = sum + ss lower = pp - ss upper = pp + ss err = ss real_err = float(abs(y_test[ii] - pp)) err_err = abs(real_err - err) MAE_err = MAE_err + err_err if (pp >= lower) and (pp <= upper): in_bounds = "True" count = count + 1 else: in_bounds = "False" line2_0 = str(ii) + " " + jid[ii] + " " + str(y_test[ii]) + " " line2_1 = line2_0 + str(lower) + " " + str(pp) + " " + str(real_err) line2 = ( line2_1 + " " + str(upper) + " " + str(ss) + " " + str(in_bounds) + "\n" ) fout2.write(line2) print("") print("Number of test materials= " + str(len(actual))) print( "Percentage of in-bound results= " + str((float(count) / (len(actual))) * 100) + "%" ) print(" ") MAE_error = float(MAE_err) / (len(actual)) print("MAE predicted error = " + str(MAE_error)) info["MAE_Error"] = MAE_error return info