Deploying an XGBoost model
import pandas as pd
import numpy as np
import xgboost as xgb
import re
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Seed NumPy's global RNG; train_test_split below is called without an
# explicit random_state, presumably relying on this seed for a repeatable split.
np.random.seed(42)

# Titanic data set from OpenML (version 1), returned as an (X, y) pair.
X, y = fetch_openml("titanic", version=1, return_X_y=True, as_frame=True)

# Hold out 20% of the rows, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)

# Feature columns handed to XGBoost.
feature_columns = ["pclass", "age", "fare", "sibsp", "parch"]

# Training matrix carries the integer-cast labels; the test matrix is
# features only. NOTE: X_test is rebound from the DataFrame to a DMatrix here.
X_y_train = xgb.DMatrix(X_train[feature_columns], label=y_train.astype(int))
X_test = xgb.DMatrix(X_test[feature_columns])

# Notebook-style peek at the training features (no effect when run as a script).
X_train[feature_columns].head()
# Booster configuration. base_score is the mean of the integer-cast training
# labels; model_to_py later reads it back from this dict.
params = dict(
    base_score=np.mean(y_train.astype(int)),
    eta=0.1,
    max_depth=3,
    gamma=3,
    objective="reg:squarederror",
    eval_metric="mae",
)

# Fit a deliberately tiny model: three boosting rounds.
model = xgb.train(params, X_y_train, num_boost_round=3)
# Render each of the three boosted trees (shown inline when run in a notebook).
xgb.to_graphviz(model, num_trees=0)
xgb.to_graphviz(model, num_trees=1)
xgb.to_graphviz(model, num_trees=2)

# Print the text dump of every tree, one dump after another.
print(*model.get_dump(), sep="\n")
# Decimal literal as it appears in a dump, including scientific notation
# such as "-1.5e-05".
_NUMBER = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# Split node: "<tabs><id>:[<feature><<threshold>] yes=<id>,no=<id>,missing=<id>"
_SPLIT_LINE = re.compile(
    r"^(?P<tabs>\t*)(?P<node>\d+):"
    r"\[(?P<feature>.+)<(?P<threshold>" + _NUMBER + r")\]"
    r"\s+yes=(?P<yes>\d+),no=(?P<no>\d+),missing=(?P<missing>\d+)"
)

# Leaf node: "<tabs><id>:leaf=<value>"
_LEAF_LINE = re.compile(
    r"^(?P<tabs>\t*)(?P<node>\d+):leaf=(?P<value>" + _NUMBER + r")"
)

# Indentation of the generated source. Node code lives inside
# "def xgb_tree" -> "if num_booster == k:", i.e. two levels deep, and every
# tab of dump depth nests it one level further.
_INDENT_STEP = " " * 4
_TREE_BODY_INDENT = _INDENT_STEP * 2


def string_parser(s):
    """Translate one line of an XGBoost text dump into Python source.

    Args:
        s: A single dump line. Leading tabs give the node's depth. The line
            is either a leaf (``"\\t\\t3:leaf=0.1"``) or a split
            (``"0:[fare<15.17] yes=1,no=2,missing=1"``). Trailing fields
            after the recognised part are ignored.

    Returns:
        Two lines of source, each newline-terminated: an ``if state == <id>:``
        guard followed by either ``return <leaf value>`` or an assignment
        moving ``state`` to the child node. When the dump routes missing
        values to the "yes" child, the condition also accepts NaN through
        ``np.isnan``; otherwise NaN fails the ``<`` test and falls through
        to the "no" child.

    Raises:
        ValueError: If the line is neither a leaf nor a numeric ``<`` split.
    """
    leaf = _LEAF_LINE.match(s)
    if leaf is not None:
        # A root-level leaf has no tabs; the optional tab group handles it.
        pad = _TREE_BODY_INDENT + _INDENT_STEP * len(leaf.group("tabs"))
        return (
            f"{pad}if state == {leaf.group('node')}:\n"
            f"{pad}{_INDENT_STEP}return {leaf.group('value')}\n"
        )

    split = _SPLIT_LINE.match(s)
    if split is None:
        raise ValueError(f"unsupported XGBoost dump line: {s!r}")

    pad = _TREE_BODY_INDENT + _INDENT_STEP * len(split.group("tabs"))
    # repr() quotes and escapes the feature name for use as a dict key.
    key = repr(split.group("feature"))
    condition = f"x[{key}] < {split.group('threshold')}"
    if split.group("yes") == split.group("missing"):
        condition += f" or np.isnan(x[{key}])"
    return (
        f"{pad}if state == {split.group('node')}:\n"
        f"{pad}{_INDENT_STEP}state = ({split.group('yes')} if {condition} "
        f"else {split.group('no')})\n"
    )


def tree_parser(tree, i):
    """Generate the ``xgb_tree`` branch that evaluates one boosted tree.

    Args:
        tree: Text dump of a single tree, one node per line.
        i: Zero-based index of the tree; tree 0 opens the ``if`` chain and
            later trees continue it with ``elif``.

    Returns:
        Source for ``if/elif num_booster == i:`` followed by the code of
        every node in the tree.
    """
    keyword = "if" if i == 0 else "elif"
    header = (
        f"{_INDENT_STEP}{keyword} num_booster == {i}:\n"
        f"{_TREE_BODY_INDENT}state = 0\n"
    )
    # splitlines() plus the blank-line filter keeps every node whether or
    # not the dump ends with a newline.
    nodes = (string_parser(line) for line in tree.splitlines() if line.strip())
    return header + "".join(nodes)


def model_to_py(base_score, model, out_file):
    """Write a Python module that re-implements ``model``'s predictions.

    The generated module imports NumPy and defines ``xgb_tree(x, num_booster)``
    (the output of a single tree) and ``xgb_predict(x)`` (``base_score`` plus
    the sum of all tree outputs), where ``x`` maps feature names to values.

    NOTE(review): the generated code compares Python floats; XGBoost is
    understood to compare in float32, so inputs extremely close to a
    threshold may route differently -- confirm this is acceptable.

    Args:
        base_score: Initial prediction; written into the module as a float
            literal.
        model: Object whose ``get_dump()`` returns one text dump per tree.
        out_file: Path of the module to create or overwrite.
    """
    trees = model.get_dump()
    pieces = ["import numpy as np\n\n\n", "def xgb_tree(x, num_booster):\n"]
    pieces.extend(tree_parser(tree, i) for i, tree in enumerate(trees))
    if not trees:
        # Keep the generated function syntactically valid for an empty model.
        pieces.append(f"{_INDENT_STEP}return None\n")
    pieces.append(
        "\n\ndef xgb_predict(x):\n"
        f"{_INDENT_STEP}# initialize prediction with base score\n"
        f"{_INDENT_STEP}predict = {float(base_score)!r}\n"
        f"{_INDENT_STEP}for i in range({len(trees)}):\n"
        f"{_INDENT_STEP * 2}predict = predict + xgb_tree(x, i)\n"
        f"{_INDENT_STEP}return predict\n"
    )
    with open(out_file, "w", encoding="utf-8") as the_file:
        the_file.write("".join(pieces))
# Export the trained booster as plain Python source in the working directory.
model_to_py(params['base_score'], model, 'xgb_model.py')

# xgb_model.py was created after the interpreter started, so the import
# system may hold a stale directory listing; invalidate it before importing.
# NOTE(review): assumes the working directory is on sys.path -- confirm.
import importlib
importlib.invalidate_caches()
import xgb_model

# Each passenger is a dict keyed by feature name; NaN marks a missing value.
passenger_data_1 = {'pclass':3, 'age':np.nan, 'sibsp':0, 'parch':0, 'fare':7.8958}
passenger_data_2 = {'pclass':1, 'age':46, 'sibsp':0, 'parch':0, 'fare':26}

# Score both passengers with the generated pure-Python model.
print(xgb_model.xgb_predict(passenger_data_1))
print(xgb_model.xgb_predict(passenger_data_2))