diff --git a/build/lib/pycaret/classification.py b/build/lib/pycaret/classification.py deleted file mode 100644 index e787abb9c7499724ccd393242db1a95abe6d307a..0000000000000000000000000000000000000000 --- a/build/lib/pycaret/classification.py +++ /dev/null @@ -1,12145 +0,0 @@ -# Module: Classification -# Author: Moez Ali -# License: MIT -# Release: PyCaret 2.0x -# Last modified : 30/07/2020 - -def setup(data, - target, - train_size = 0.7, - sampling = True, - sample_estimator = None, - categorical_features = None, - categorical_imputation = 'constant', - ordinal_features = None, - high_cardinality_features = None, - high_cardinality_method = 'frequency', - numeric_features = None, - numeric_imputation = 'mean', - date_features = None, - ignore_features = None, - normalize = False, - normalize_method = 'zscore', - transformation = False, - transformation_method = 'yeo-johnson', - handle_unknown_categorical = True, - unknown_categorical_method = 'least_frequent', - pca = False, - pca_method = 'linear', - pca_components = None, - ignore_low_variance = False, - combine_rare_levels = False, - rare_level_threshold = 0.10, - bin_numeric_features = None, - remove_outliers = False, - outliers_threshold = 0.05, - remove_multicollinearity = False, - multicollinearity_threshold = 0.9, - remove_perfect_collinearity = False, #added in pycaret==2.0.0 - create_clusters = False, - cluster_iter = 20, - polynomial_features = False, - polynomial_degree = 2, - trigonometry_features = False, - polynomial_threshold = 0.1, - group_features = None, - group_names = None, - feature_selection = False, - feature_selection_threshold = 0.8, - feature_interaction = False, - feature_ratio = False, - interaction_threshold = 0.01, - fix_imbalance = False, #added in pycaret==2.0.0 - fix_imbalance_method = None, #added in pycaret==2.0.0 - data_split_shuffle = True, #added in pycaret==2.0.0 - folds_shuffle = False, #added in pycaret==2.0.0 - n_jobs = -1, #added in pycaret==2.0.0 - html = True, #added in pycaret==2.0.0 - session_id = None, - log_experiment = False, #added in pycaret==2.0.0 - experiment_name = None, #added in pycaret==2.0.0 - log_plots = False, #added in pycaret==2.0.0 - log_profile = False, #added in pycaret==2.0.0 - log_data = False, #added in pycaret==2.0.0 - silent=False, - verbose=True, #added in pycaret==2.0.0 - profile = False): - - """ - - Description: - ------------ - This function initializes the environment in pycaret and creates the transformation - pipeline to prepare the data for modeling and deployment. setup() must be called before - executing any other function in pycaret. It takes two mandatory parameters: - dataframe {array-like, sparse matrix} and the name of the target column. - - All other parameters are optional. - - Example - ------- - from pycaret.datasets import get_data - juice = get_data('juice') - - experiment_name = setup(data = juice, target = 'Purchase') - - 'juice' is a pandas DataFrame and 'Purchase' is the name of the target column. - - Parameters - ---------- - data : {array-like, sparse matrix}, shape (n_samples, n_features) where n_samples - is the number of samples and n_features is the number of features. - - target: string - Name of the target column to be passed in as a string. The target variable could - be binary or multiclass. In case of a multiclass target, all estimators are wrapped - with a OneVsRest classifier. - - train_size: float, default = 0.7 - Size of the training set. By default, 70% of the data will be used for training - and validation.
The remaining data will be used for a test / hold-out set. - - sampling: bool, default = True - When the sample size exceeds 25,000 samples, pycaret will build a base estimator - at various sample sizes from the original dataset. This will return a performance - plot of AUC, Accuracy, Recall, Precision, Kappa and F1 values at various sample - levels that will assist in deciding the preferred sample size for modeling. - The desired sample size must then be entered for training and validation in the - pycaret environment. When the sample size entered is less than 1, the remaining dataset - (1 - sample) is used for fitting the model only when finalize_model() is called. - - sample_estimator: object, default = None - If None, Logistic Regression is used by default. - - categorical_features: string, default = None - If the inferred data types are not correct, categorical_features can be used to - overwrite the inferred type. If, when running setup, the type of 'column1' is - inferred as numeric instead of categorical, then this parameter can be used - to overwrite the type by passing categorical_features = ['column1']. - - categorical_imputation: string, default = 'constant' - If missing values are found in categorical features, they will be imputed with - a constant 'not_available' value. The other available option is 'mode' which - imputes the missing value using the most frequent value in the training dataset. - - ordinal_features: dictionary, default = None - When the data contains ordinal features, they must be encoded differently using - the ordinal_features param. If the data has a categorical variable with values - of 'low', 'medium', 'high' and it is known that low < medium < high, then it can - be passed as ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }. - The list sequence must be in increasing order from lowest to highest. - - high_cardinality_features: string, default = None - When the data contains features with high cardinality, they can be compressed - into fewer levels by passing them as a list of column names with high cardinality. - Features are compressed using the method defined in the high_cardinality_method param. - - high_cardinality_method: string, default = 'frequency' - When the method is set to 'frequency', the original value of the feature is replaced - with its frequency distribution, converting the feature into numeric. The other - available method is 'clustering', which clusters the statistical attributes of the - data and replaces the original value of the feature with the cluster label. - The number of clusters is determined using a combination of Calinski-Harabasz and - Silhouette criterion. - - numeric_features: string, default = None - If the inferred data types are not correct, numeric_features can be used to - overwrite the inferred type. If, when running setup, the type of 'column1' is - inferred as categorical instead of numeric, then this parameter can be used - to overwrite the type by passing numeric_features = ['column1']. - - numeric_imputation: string, default = 'mean' - If missing values are found in numeric features, they will be imputed with the - mean value of the feature. The other available option is 'median' which imputes - the value using the median value in the training dataset. - - date_features: string, default = None - If the data has a DateTime column that is not automatically detected when running - setup, this parameter can be used by passing date_features = 'date_column_name'. - It can work with multiple date columns.
Date columns are not used in modeling. - Instead, feature extraction is performed and date columns are dropped from the - dataset. If the date column includes a time stamp, features related to time will - also be extracted. - - ignore_features: string, default = None - If any feature should be ignored for modeling, it can be passed to the param - ignore_features. The ID and DateTime columns, when inferred, are automatically - set to be ignored for modeling. - - normalize: bool, default = False - When set to True, the feature space is transformed using the normalize_method - param. Generally, linear algorithms perform better with normalized data; however, - the results may vary and it is advised to run multiple experiments to evaluate - the benefit of normalization. - - normalize_method: string, default = 'zscore' - Defines the method to be used for normalization. By default, the normalize method - is set to 'zscore'. The standard zscore is calculated as z = (x - u) / s. The - other available options are: - - 'minmax' : scales and translates each feature individually such that it is in - the range of 0 - 1. - - 'maxabs' : scales and translates each feature individually such that the maximal - absolute value of each feature will be 1.0. It does not shift/center - the data, and thus does not destroy any sparsity. - - 'robust' : scales and translates each feature according to the interquartile range. - When the dataset contains outliers, the robust scaler often gives better - results. - - transformation: bool, default = False - When set to True, a power transformation is applied to make the data more normal / - Gaussian-like. This is useful for modeling issues related to heteroscedasticity or - other situations where normality is desired. The optimal parameter for stabilizing - variance and minimizing skewness is estimated through maximum likelihood. - - transformation_method: string, default = 'yeo-johnson' - Defines the method for transformation. By default, the transformation method is set - to 'yeo-johnson'. The other available option is 'quantile' transformation. Both - transformations map the feature set to follow a Gaussian-like or normal - distribution. Note that the quantile transformer is non-linear and may distort linear - correlations between variables measured at the same scale. - - handle_unknown_categorical: bool, default = True - When set to True, unknown categorical levels in new / unseen data are replaced by - the most or least frequent level as learned in the training data. The method is - defined under the unknown_categorical_method param. - - unknown_categorical_method: string, default = 'least_frequent' - Method used to replace unknown categorical levels in unseen data. Method can be - set to 'least_frequent' or 'most_frequent'. - - pca: bool, default = False - When set to True, dimensionality reduction is applied to project the data into - a lower dimensional space using the method defined in the pca_method param. In - supervised learning, pca is generally performed when dealing with a high feature - space and memory is a constraint. Note that not all datasets can be decomposed - efficiently using a linear PCA technique and that applying PCA may result in loss - of information. As such, it is advised to run multiple experiments with different - pca_methods to evaluate the impact. - - pca_method: string, default = 'linear' - The 'linear' method performs Linear dimensionality reduction using Singular Value - Decomposition.
The other available options are: - - kernel : dimensionality reduction through the use of the RBF kernel. - - incremental : replacement for 'linear' pca when the dataset to be decomposed is - too large to fit in memory. - - pca_components: int/float, default = 0.99 - Number of components to keep. If pca_components is a float, it is treated as a - target percentage for information retention. When pca_components is an integer, - it is treated as the number of features to be kept. pca_components must be strictly - less than the original number of features in the dataset. - - ignore_low_variance: bool, default = False - When set to True, all categorical features with statistically insignificant variances - are removed from the dataset. The variance is calculated using the ratio of unique - values to the number of samples, and the ratio of the most common value to the - frequency of the second most common value. - - combine_rare_levels: bool, default = False - When set to True, all levels in categorical features below the threshold defined - in the rare_level_threshold param are combined together as a single level. There must be - at least two levels under the threshold for this to take effect. rare_level_threshold - represents the percentile distribution of level frequency. Generally, this technique - is applied to limit a sparse matrix caused by high numbers of levels in categorical - features. - - rare_level_threshold: float, default = 0.1 - Percentile distribution below which rare categories are combined. Only comes into - effect when combine_rare_levels is set to True. - - bin_numeric_features: list, default = None - When a list of numeric features is passed, they are transformed into categorical - features using KMeans, where values in each bin have the same nearest center of a - 1D k-means cluster. The number of clusters is determined based on the 'sturges' - method. It is only optimal for Gaussian data and underestimates the number of bins - for large non-Gaussian datasets. - - remove_outliers: bool, default = False - When set to True, outliers from the training data are removed using PCA linear - dimensionality reduction with the Singular Value Decomposition technique. - - outliers_threshold: float, default = 0.05 - The percentage / proportion of outliers in the dataset can be defined using - the outliers_threshold param. By default, 0.05 is used, which means 0.025 of the - values on each side of the distribution's tail are dropped from the training data. - - remove_multicollinearity: bool, default = False - When set to True, the variables with inter-correlations higher than the threshold - defined under the multicollinearity_threshold param are dropped. When two features - are highly correlated with each other, the feature that is less correlated with - the target variable is dropped. - - multicollinearity_threshold: float, default = 0.9 - Threshold used for dropping the correlated features. Only comes into effect when - remove_multicollinearity is set to True. - - remove_perfect_collinearity: bool, default = False - When set to True, perfect collinearity (features with correlation = 1) is removed - from the dataset. When two features are 100% correlated, one of them is randomly - dropped from the dataset. - - create_clusters: bool, default = False - When set to True, an additional feature is created where each instance is assigned - to a cluster. The number of clusters is determined using a combination of - Calinski-Harabasz and Silhouette criterion.
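    Example (illustrative) - a minimal, hedged sketch of the preprocessing flags documented
    above, reusing the 'juice' dataset from the Example section; the parameter values shown
    are illustrative, not tuned recommendations:

        from pycaret.datasets import get_data
        from pycaret.classification import setup
        juice = get_data('juice')
        # normalization, rare-level grouping, outlier and collinearity handling
        env = setup(data = juice, target = 'Purchase',
                    normalize = True, normalize_method = 'robust',
                    combine_rare_levels = True, rare_level_threshold = 0.05,
                    remove_outliers = True, outliers_threshold = 0.05,
                    remove_multicollinearity = True, multicollinearity_threshold = 0.9,
                    session_id = 123)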
- - cluster_iter: int, default = 20 - Number of iterations used to create a cluster. Each iteration represents cluster - size. Only comes into effect when the create_clusters param is set to True. - - polynomial_features: bool, default = False - When set to True, new features are created based on all polynomial combinations - that exist within the numeric features in a dataset to the degree defined in the - polynomial_degree param. - - polynomial_degree: int, default = 2 - Degree of polynomial features. For example, if an input sample is two dimensional - and of the form [a, b], the polynomial features with degree = 2 are: - [1, a, b, a^2, ab, b^2]. - - trigonometry_features: bool, default = False - When set to True, new features are created based on all trigonometric combinations - that exist within the numeric features in a dataset to the degree defined in the - polynomial_degree param. - - polynomial_threshold: float, default = 0.1 - This is used to compress a sparse matrix of polynomial and trigonometric features. - Polynomial and trigonometric features whose feature importance based on the - combination of Random Forest, AdaBoost and Linear correlation falls within the - percentile of the defined threshold are kept in the dataset. Remaining features - are dropped before further processing. - - group_features: list or list of list, default = None - When a dataset contains features that have related characteristics, the group_features - param can be used for statistical feature extraction. For example, if a dataset has - numeric features that are related with each other (i.e. 'Col1', 'Col2', 'Col3'), a list - containing the column names can be passed under group_features to extract statistical - information such as the mean, median, mode and standard deviation. - - group_names: list, default = None - When group_features is passed, names for the groups can be passed into the group_names - param as a list containing strings. The length of the group_names list must be equal to the - length of group_features. When the length doesn't match or the names are not passed, new - features are sequentially named such as group_1, group_2 etc. - - feature_selection: bool, default = False - When set to True, a subset of features is selected using a combination of various - permutation importance techniques including Random Forest, Adaboost and Linear - correlation with the target variable. The size of the subset is dependent on the - feature_selection_threshold param. Generally, this is used to constrain the feature space - in order to improve efficiency in modeling. When polynomial_features and - feature_interaction are used, it is highly recommended to define the - feature_selection_threshold param with a lower value. - - feature_selection_threshold: float, default = 0.8 - Threshold used for feature selection (including newly created polynomial features). - A higher value will result in a larger feature space. It is recommended to do multiple - trials with different values of feature_selection_threshold, especially in cases where - polynomial_features and feature_interaction are used. Setting a very low value may be - efficient but could result in under-fitting. - - feature_interaction: bool, default = False - When set to True, it will create new features by interacting (a * b) all numeric - variables in the dataset, including polynomial and trigonometric features (if created). - This feature is not scalable and may not work as expected on datasets with a large - feature space.
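    Example (illustrative) - a hedged sketch of the feature engineering flags above;
    'Col1', 'Col2', 'Col3' are the same placeholder column names used in the group_features
    description and must exist as numeric columns in your data. A lower
    feature_selection_threshold is used here, per the recommendation above:

        env = setup(data = juice, target = 'Purchase',
                    polynomial_features = True, polynomial_degree = 2,
                    group_features = [['Col1', 'Col2', 'Col3']],
                    group_names = ['group_1'],
                    feature_selection = True, feature_selection_threshold = 0.5,
                    session_id = 123)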
- - feature_ratio: bool, default = False - When set to True, it will create new features by calculating the ratios (a / b) of all - numeric variables in the dataset. This feature is not scalable and may not work as - expected on datasets with a large feature space. - - interaction_threshold: float, default = 0.01 - Similar to polynomial_threshold, it is used to compress a sparse matrix of newly - created features through interaction. Features whose importance based on the - combination of Random Forest, AdaBoost and Linear correlation falls within the - percentile of the defined threshold are kept in the dataset. Remaining features - are dropped before further processing. - - fix_imbalance: bool, default = False - When the dataset has an unequal distribution of the target class, it can be fixed using - the fix_imbalance parameter. When set to True, SMOTE (Synthetic Minority Over-sampling - Technique) is applied by default to create synthetic datapoints for the minority class. - - fix_imbalance_method: obj, default = None - When fix_imbalance is set to True and fix_imbalance_method is None, 'smote' is applied - by default to oversample the minority class during cross validation. This parameter - accepts any module from 'imblearn' that supports the 'fit_resample' method. - - data_split_shuffle: bool, default = True - If set to False, prevents shuffling of rows when splitting data. - - folds_shuffle: bool, default = False - If set to False, prevents shuffling of rows when using cross validation. - - n_jobs: int, default = -1 - The number of jobs to run in parallel (for functions that support parallel - processing). -1 means using all processors. To run all functions on a single processor, - set n_jobs to None. - - html: bool, default = True - If set to False, prevents runtime display of the monitor. This must be set to False - when using an environment that doesn't support HTML. - - session_id: int, default = None - If None, a random seed is generated and returned in the Information grid. The - unique number is then distributed as a seed in all functions used during the - experiment. This can be used for later reproducibility of the entire experiment. - - log_experiment: bool, default = False - When set to True, all metrics and parameters are logged on the MLflow server. - - experiment_name: str, default = None - Name of the experiment for logging. When set to None, 'clf' is used by default as the - alias for the experiment name. - - log_plots: bool, default = False - When set to True, specific plots are logged in MLflow as a png file. By default, - it is set to False. - - log_profile: bool, default = False - When set to True, the data profile is also logged on MLflow as an html file. By default, - it is set to False. - - log_data: bool, default = False - When set to True, the train and test datasets are logged as csv files. - - silent: bool, default = False - When set to True, confirmation of data types is not required. All preprocessing will - be performed assuming automatically inferred data types. Not recommended for direct use - except for established pipelines. - - verbose: bool, default = True - Information grid is not printed when verbose is set to False. - - profile: bool, default = False - If set to True, a data profile for Exploratory Data Analysis will be displayed - in an interactive HTML report. - - Returns: - -------- - - info grid: Information grid is printed. - ----------- - - environment: This function returns various outputs that are stored in variables - ----------- as tuples. They are used by other functions in pycaret.
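    Example (illustrative) - a hedged sketch tying together the reproducibility and
    logging parameters; experiment_name is an arbitrary label, and log_experiment
    requires a working MLflow installation:

        env = setup(data = juice, target = 'Purchase',
                    session_id = 123,
                    log_experiment = True, experiment_name = 'juice_clf_1',
                    log_plots = True, log_data = True)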
- - - """ - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - #exception checking - import sys - - from pycaret.utils import __version__ - ver = __version__() - - import logging - - # create logger - global logger - - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("PyCaret Classification Module") - logger.info('version ' + str(ver)) - logger.info("Initializing setup()") - - #generate USI for mlflow tracking - import secrets - global USI - USI = secrets.token_hex(nbytes=2) - logger.info('USI: ' + str(USI)) - - logger.info("""setup(data={}, target={}, train_size={}, sampling={}, sample_estimator={}, categorical_features={}, categorical_imputation={}, ordinal_features={}, - high_cardinality_features={}, high_cardinality_method={}, numeric_features={}, numeric_imputation={}, date_features={}, ignore_features={}, normalize={}, - normalize_method={}, transformation={}, transformation_method={}, handle_unknown_categorical={}, unknown_categorical_method={}, pca={}, pca_method={}, - pca_components={}, ignore_low_variance={}, combine_rare_levels={}, rare_level_threshold={}, bin_numeric_features={}, remove_outliers={}, outliers_threshold={}, - remove_multicollinearity={}, multicollinearity_threshold={}, remove_perfect_collinearity={}, create_clusters={}, cluster_iter={}, - polynomial_features={}, polynomial_degree={}, trigonometry_features={}, polynomial_threshold={}, group_features={}, - group_names={}, feature_selection={}, feature_selection_threshold={}, feature_interaction={}, feature_ratio={}, interaction_threshold={}, - fix_imbalance={}, fix_imbalance_method={}, data_split_shuffle={}, folds_shuffle={}, n_jobs={}, html={}, session_id={}, log_experiment={}, - experiment_name={}, log_plots={}, log_profile={}, log_data={}, silent={}, verbose={}, profile={})""".format(\ - str(data.shape), str(target), str(train_size), str(sampling), str(sample_estimator), str(categorical_features), str(categorical_imputation), str(ordinal_features),\ - str(high_cardinality_features), str(high_cardinality_method), str(numeric_features), str(numeric_imputation), str(date_features), str(ignore_features),\ - str(normalize), str(normalize_method), str(transformation), str(transformation_method), str(handle_unknown_categorical), str(unknown_categorical_method), str(pca),\ - str(pca_method), str(pca_components), str(ignore_low_variance), str(combine_rare_levels), str(rare_level_threshold), str(bin_numeric_features), str(remove_outliers),\ - str(outliers_threshold), str(remove_multicollinearity), str(multicollinearity_threshold), str(remove_perfect_collinearity), str(create_clusters), str(cluster_iter),\ - str(polynomial_features), str(polynomial_degree), str(trigonometry_features), str(polynomial_threshold), str(group_features), str(group_names),\ - str(feature_selection), str(feature_selection_threshold), str(feature_interaction), str(feature_ratio), str(interaction_threshold), str(fix_imbalance),\ - str(fix_imbalance_method), str(data_split_shuffle), str(folds_shuffle), str(n_jobs), str(html), str(session_id), str(log_experiment), str(experiment_name),\ - str(log_plots), str(log_profile), 
str(log_data), str(silent), str(verbose), str(profile))) - - #logging environment and libraries - logger.info("Checking environment") - - from platform import python_version, platform, python_build, machine - - logger.info("python_version: " + str(python_version())) - logger.info("python_build: " + str(python_build())) - logger.info("machine: " + str(machine())) - logger.info("platform: " + str(platform())) - - try: - import psutil - logger.info("Memory: " + str(psutil.virtual_memory())) - logger.info("Physical Core: " + str(psutil.cpu_count(logical=False))) - logger.info("Logical Core: " + str(psutil.cpu_count(logical=True))) - except: - logger.warning("cannot find psutil installation. memory not traceable. Install psutil using pip to enable memory logging. ") - - logger.info("Checking libraries") - - try: - from pandas import __version__ - logger.info("pd==" + str(__version__)) - except: - logger.warning("pandas not found") - - try: - from numpy import __version__ - logger.info("numpy==" + str(__version__)) - except: - logger.warning("numpy not found") - - try: - from sklearn import __version__ - logger.info("sklearn==" + str(__version__)) - except: - logger.warning("sklearn not found") - - try: - from xgboost import __version__ - logger.info("xgboost==" + str(__version__)) - except: - logger.warning("xgboost not found") - - try: - from lightgbm import __version__ - logger.info("lightgbm==" + str(__version__)) - except: - logger.warning("lightgbm not found") - - try: - from catboost import __version__ - logger.info("catboost==" + str(__version__)) - except: - logger.warning("catboost not found") - - try: - from mlflow.version import VERSION - import warnings - warnings.filterwarnings('ignore') - logger.info("mlflow==" + str(VERSION)) - except: - logger.warning("mlflow not found") - - #run_time - import datetime, time - runtime_start = time.time() - - logger.info("Checking Exceptions") - - #checking train size parameter - if type(train_size) is not float: - sys.exit('(Type Error): train_size parameter only accepts float value.') - - #checking sampling parameter - if type(sampling) is not bool: - sys.exit('(Type Error): sampling parameter only accepts True or False.') - - #checking sampling parameter - if target not in data.columns: - sys.exit('(Value Error): Target parameter doesnt exist in the data provided.') - - #checking session_id - if session_id is not None: - if type(session_id) is not int: - sys.exit('(Type Error): session_id parameter must be an integer.') - - #checking sampling parameter - if type(profile) is not bool: - sys.exit('(Type Error): profile parameter only accepts True or False.') - - #checking normalize parameter - if type(normalize) is not bool: - sys.exit('(Type Error): normalize parameter only accepts True or False.') - - #checking transformation parameter - if type(transformation) is not bool: - sys.exit('(Type Error): transformation parameter only accepts True or False.') - - #checking categorical imputation - allowed_categorical_imputation = ['constant', 'mode'] - if categorical_imputation not in allowed_categorical_imputation: - sys.exit("(Value Error): categorical_imputation param only accepts 'constant' or 'mode' ") - - #ordinal_features - if ordinal_features is not None: - if type(ordinal_features) is not dict: - sys.exit("(Type Error): ordinal_features must be of type dictionary with column name as key and ordered values as list. 
") - - #ordinal features check - if ordinal_features is not None: - data_cols = data.columns - data_cols = data_cols.drop(target) - ord_keys = ordinal_features.keys() - - for i in ord_keys: - if i not in data_cols: - sys.exit("(Value Error) Column name passed as a key in ordinal_features param doesnt exist. ") - - for k in ord_keys: - if data[k].nunique() != len(ordinal_features.get(k)): - sys.exit("(Value Error) Levels passed in ordinal_features param doesnt match with levels in data. ") - - for i in ord_keys: - value_in_keys = ordinal_features.get(i) - value_in_data = list(data[i].unique().astype(str)) - for j in value_in_keys: - if j not in value_in_data: - text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'." - sys.exit(text) - - #high_cardinality_features - if high_cardinality_features is not None: - if type(high_cardinality_features) is not list: - sys.exit("(Type Error): high_cardinality_features param only accepts name of columns as a list. ") - - if high_cardinality_features is not None: - data_cols = data.columns - data_cols = data_cols.drop(target) - for i in high_cardinality_features: - if i not in data_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #high_cardinality_methods - high_cardinality_allowed_methods = ['frequency', 'clustering'] - if high_cardinality_method not in high_cardinality_allowed_methods: - sys.exit("(Value Error): high_cardinality_method param only accepts 'frequency' or 'clustering' ") - - #checking numeric imputation - allowed_numeric_imputation = ['mean', 'median'] - if numeric_imputation not in allowed_numeric_imputation: - sys.exit("(Value Error): numeric_imputation param only accepts 'mean' or 'median' ") - - #checking normalize method - allowed_normalize_method = ['zscore', 'minmax', 'maxabs', 'robust'] - if normalize_method not in allowed_normalize_method: - sys.exit("(Value Error): normalize_method param only accepts 'zscore', 'minxmax', 'maxabs' or 'robust'. ") - - #checking transformation method - allowed_transformation_method = ['yeo-johnson', 'quantile'] - if transformation_method not in allowed_transformation_method: - sys.exit("(Value Error): transformation_method param only accepts 'yeo-johnson' or 'quantile'. ") - - #handle unknown categorical - if type(handle_unknown_categorical) is not bool: - sys.exit('(Type Error): handle_unknown_categorical parameter only accepts True or False.') - - #unknown categorical method - unknown_categorical_method_available = ['least_frequent', 'most_frequent'] - - if unknown_categorical_method not in unknown_categorical_method_available: - sys.exit("(Type Error): unknown_categorical_method only accepts 'least_frequent' or 'most_frequent'.") - - #check pca - if type(pca) is not bool: - sys.exit('(Type Error): PCA parameter only accepts True or False.') - - #pca method check - allowed_pca_methods = ['linear', 'kernel', 'incremental'] - if pca_method not in allowed_pca_methods: - sys.exit("(Value Error): pca method param only accepts 'linear', 'kernel', or 'incremental'. ") - - #pca components check - if pca is True: - if pca_method != 'linear': - if pca_components is not None: - if(type(pca_components)) is not int: - sys.exit("(Type Error): pca_components parameter must be integer when pca_method is not 'linear'. 
") - - #pca components check 2 - if pca is True: - if pca_method != 'linear': - if pca_components is not None: - if pca_components > len(data.columns)-1: - sys.exit("(Type Error): pca_components parameter cannot be greater than original features space.") - - #pca components check 3 - if pca is True: - if pca_method == 'linear': - if pca_components is not None: - if type(pca_components) is not float: - if pca_components > len(data.columns)-1: - sys.exit("(Type Error): pca_components parameter cannot be greater than original features space or float between 0 - 1.") - - #check ignore_low_variance - if type(ignore_low_variance) is not bool: - sys.exit('(Type Error): ignore_low_variance parameter only accepts True or False.') - - #check ignore_low_variance - if type(combine_rare_levels) is not bool: - sys.exit('(Type Error): combine_rare_levels parameter only accepts True or False.') - - #check rare_level_threshold - if type(rare_level_threshold) is not float: - sys.exit('(Type Error): rare_level_threshold must be a float between 0 and 1. ') - - #bin numeric features - if bin_numeric_features is not None: - all_cols = list(data.columns) - all_cols.remove(target) - - for i in bin_numeric_features: - if i not in all_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #remove_outliers - if type(remove_outliers) is not bool: - sys.exit('(Type Error): remove_outliers parameter only accepts True or False.') - - #outliers_threshold - if type(outliers_threshold) is not float: - sys.exit('(Type Error): outliers_threshold must be a float between 0 and 1. ') - - #remove_multicollinearity - if type(remove_multicollinearity) is not bool: - sys.exit('(Type Error): remove_multicollinearity parameter only accepts True or False.') - - #multicollinearity_threshold - if type(multicollinearity_threshold) is not float: - sys.exit('(Type Error): multicollinearity_threshold must be a float between 0 and 1. ') - - #create_clusters - if type(create_clusters) is not bool: - sys.exit('(Type Error): create_clusters parameter only accepts True or False.') - - #cluster_iter - if type(cluster_iter) is not int: - sys.exit('(Type Error): cluster_iter must be a integer greater than 1. ') - - #polynomial_features - if type(polynomial_features) is not bool: - sys.exit('(Type Error): polynomial_features only accepts True or False. ') - - #polynomial_degree - if type(polynomial_degree) is not int: - sys.exit('(Type Error): polynomial_degree must be an integer. ') - - #polynomial_features - if type(trigonometry_features) is not bool: - sys.exit('(Type Error): trigonometry_features only accepts True or False. ') - - #polynomial threshold - if type(polynomial_threshold) is not float: - sys.exit('(Type Error): polynomial_threshold must be a float between 0 and 1. ') - - #group features - if group_features is not None: - if type(group_features) is not list: - sys.exit('(Type Error): group_features must be of type list. ') - - if group_names is not None: - if type(group_names) is not list: - sys.exit('(Type Error): group_names must be of type list. ') - - #cannot drop target - if ignore_features is not None: - if target in ignore_features: - sys.exit("(Value Error): cannot drop target column. ") - - #feature_selection - if type(feature_selection) is not bool: - sys.exit('(Type Error): feature_selection only accepts True or False. 
') - - #feature_selection_threshold - if type(feature_selection_threshold) is not float: - sys.exit('(Type Error): feature_selection_threshold must be a float between 0 and 1. ') - - #feature_interaction - if type(feature_interaction) is not bool: - sys.exit('(Type Error): feature_interaction only accepts True or False. ') - - #feature_ratio - if type(feature_ratio) is not bool: - sys.exit('(Type Error): feature_ratio only accepts True or False. ') - - #interaction_threshold - if type(interaction_threshold) is not float: - sys.exit('(Type Error): interaction_threshold must be a float between 0 and 1. ') - - - #forced type check - all_cols = list(data.columns) - all_cols.remove(target) - - #categorical - if categorical_features is not None: - for i in categorical_features: - if i not in all_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #numeric - if numeric_features is not None: - for i in numeric_features: - if i not in all_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #date features - if date_features is not None: - for i in date_features: - if i not in all_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #drop features - if ignore_features is not None: - for i in ignore_features: - if i not in all_cols: - sys.exit("(Value Error): Feature ignored is either target column or doesn't exist in the dataset.") - - #log_experiment - if type(log_experiment) is not bool: - sys.exit("(Type Error): log_experiment parameter only accepts True or False. ") - - #log_profile - if type(log_profile) is not bool: - sys.exit("(Type Error): log_profile parameter only accepts True or False. ") - - #experiment_name - if experiment_name is not None: - if type(experiment_name) is not str: - sys.exit("(Type Error): experiment_name parameter must be string if not None. ") - - #silent - if type(silent) is not bool: - sys.exit("(Type Error): silent parameter only accepts True or False. 
") - - #remove_perfect_collinearity - if type(remove_perfect_collinearity) is not bool: - sys.exit('(Type Error): remove_perfect_collinearity parameter only accepts True or False.') - - #html - if type(html) is not bool: - sys.exit('(Type Error): html parameter only accepts True or False.') - - #folds_shuffle - if type(folds_shuffle) is not bool: - sys.exit('(Type Error): folds_shuffle parameter only accepts True or False.') - - #data_split_shuffle - if type(data_split_shuffle) is not bool: - sys.exit('(Type Error): data_split_shuffle parameter only accepts True or False.') - - #log_plots - if type(log_plots) is not bool: - sys.exit('(Type Error): log_plots parameter only accepts True or False.') - - #log_data - if type(log_data) is not bool: - sys.exit('(Type Error): log_data parameter only accepts True or False.') - - #log_profile - if type(log_profile) is not bool: - sys.exit('(Type Error): log_profile parameter only accepts True or False.') - - #fix_imbalance - if type(fix_imbalance) is not bool: - sys.exit('(Type Error): fix_imbalance parameter only accepts True or False.') - - #fix_imbalance_method - if fix_imbalance: - if fix_imbalance_method is not None: - if hasattr(fix_imbalance_method, 'fit_sample'): - pass - else: - sys.exit('(Type Error): fix_imbalance_method must contain resampler with fit_sample method.') - - logger.info("Preloading libraries") - - #pre-load libraries - import pandas as pd - import ipywidgets as ipw - from IPython.display import display, HTML, clear_output, update_display - import os - - #pandas option - pd.set_option('display.max_columns', 500) - pd.set_option('display.max_rows', 500) - - #global html_param - global html_param - - #create html_param - html_param = html - - #silent parameter to also set sampling to False - if silent: - sampling = False - - logger.info("Preparing display monitor") - - #progress bar - if sampling: - max_steps = 10 + 3 - else: - max_steps = 3 - - progress = ipw.IntProgress(value=0, min=0, max=max_steps, step=1 , description='Processing: ') - if verbose: - if html_param: - display(progress) - - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ], - ['ETC' , '. . . . . . . . . . . . . . . . . 
.', 'Calculating ETC'] ], - columns=['', ' ', ' ']).set_index('') - - if verbose: - if html_param: - display(monitor, display_id = 'monitor') - - logger.info("Importing libraries") - - #general dependencies - import numpy as np - from sklearn.linear_model import LogisticRegression - from sklearn.model_selection import train_test_split - from sklearn import metrics - import random - import seaborn as sns - import matplotlib.pyplot as plt - import plotly.express as px - - #setting sklearn config to print all parameters including default - import sklearn - sklearn.set_config(print_changed_only=False) - - #define highlight function for function grid to display - def highlight_max(s): - is_max = s == True - return ['background-color: lightgreen' if v else '' for v in is_max] - - #cufflinks - import cufflinks as cf - cf.go_offline() - cf.set_config_file(offline=False, world_readable=True) - - logger.info("Copying data for preprocessing") - - #copy original data for pandas profiler - data_before_preprocess = data.copy() - - logger.info("Declaring global variables") - - #declaring global variables to be accessed by other functions - global X, y, X_train, X_test, y_train, y_test, seed, prep_pipe, experiment__,\ - folds_shuffle_param, n_jobs_param, create_model_container, master_model_container,\ - display_container, exp_name_log, logging_param, log_plots_param,\ - fix_imbalance_param, fix_imbalance_method_param - - #generate seed to be used globally - if session_id is None: - seed = random.randint(150,9000) - else: - seed = session_id - - """ - preprocessing starts here - """ - - monitor.iloc[1,1:] = 'Preparing Data for Modeling' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - #define parameters for preprocessor - - logger.info("Declaring preprocessing parameters") - - #categorical features - if categorical_features is None: - cat_features_pass = [] - else: - cat_features_pass = categorical_features - - #numeric features - if numeric_features is None: - numeric_features_pass = [] - else: - numeric_features_pass = numeric_features - - #drop features - if ignore_features is None: - ignore_features_pass = [] - else: - ignore_features_pass = ignore_features - - #date features - if date_features is None: - date_features_pass = [] - else: - date_features_pass = date_features - - #categorical imputation strategy - if categorical_imputation == 'constant': - categorical_imputation_pass = 'not_available' - elif categorical_imputation == 'mode': - categorical_imputation_pass = 'most frequent' - - #transformation method strategy - if transformation_method == 'yeo-johnson': - trans_method_pass = 'yj' - elif transformation_method == 'quantile': - trans_method_pass = 'quantile' - - #pass method - if pca_method == 'linear': - pca_method_pass = 'pca_liner' - - elif pca_method == 'kernel': - pca_method_pass = 'pca_kernal' - - elif pca_method == 'incremental': - pca_method_pass = 'incremental' - - elif pca_method == 'pls': - pca_method_pass = 'pls' - - #pca components - if pca is True: - if pca_components is None: - if pca_method == 'linear': - pca_components_pass = 0.99 - else: - pca_components_pass = int((len(data.columns)-1)*0.5) - - else: - pca_components_pass = pca_components - - else: - pca_components_pass = 0.99 - - if bin_numeric_features is None: - apply_binning_pass = False - features_to_bin_pass = [] - - else: - apply_binning_pass = True - features_to_bin_pass = bin_numeric_features - - #trignometry - if trigonometry_features is False: - trigonometry_features_pass = [] 
- else: - trigonometry_features_pass = ['sin', 'cos', 'tan'] - - #group features - #=============# - - #apply grouping - if group_features is not None: - apply_grouping_pass = True - else: - apply_grouping_pass = False - - #group features listing - if apply_grouping_pass is True: - - if type(group_features[0]) is str: - group_features_pass = [] - group_features_pass.append(group_features) - else: - group_features_pass = group_features - - else: - - group_features_pass = [[]] - - #group names - if apply_grouping_pass is True: - - if (group_names is None) or (len(group_names) != len(group_features_pass)): - group_names_pass = list(np.arange(len(group_features_pass))) - group_names_pass = ['group_' + str(i) for i in group_names_pass] - - else: - group_names_pass = group_names - - else: - group_names_pass = [] - - #feature interactions - - if feature_interaction or feature_ratio: - apply_feature_interactions_pass = True - else: - apply_feature_interactions_pass = False - - interactions_to_apply_pass = [] - - if feature_interaction: - interactions_to_apply_pass.append('multiply') - - if feature_ratio: - interactions_to_apply_pass.append('divide') - - #unknown categorical - if unknown_categorical_method == 'least_frequent': - unknown_categorical_method_pass = 'least frequent' - elif unknown_categorical_method == 'most_frequent': - unknown_categorical_method_pass = 'most frequent' - - #ordinal_features - if ordinal_features is not None: - apply_ordinal_encoding_pass = True - else: - apply_ordinal_encoding_pass = False - - if apply_ordinal_encoding_pass is True: - ordinal_columns_and_categories_pass = ordinal_features - else: - ordinal_columns_and_categories_pass = {} - - if high_cardinality_features is not None: - apply_cardinality_reduction_pass = True - else: - apply_cardinality_reduction_pass = False - - if high_cardinality_method == 'frequency': - cardinal_method_pass = 'count' - elif high_cardinality_method == 'clustering': - cardinal_method_pass = 'cluster' - - if apply_cardinality_reduction_pass: - cardinal_features_pass = high_cardinality_features - else: - cardinal_features_pass = [] - - if silent: - display_dtypes_pass = False - else: - display_dtypes_pass = True - - logger.info("Importing preprocessing module") - - #import library - import pycaret.preprocess as preprocess - - logger.info("Creating preprocessing pipeline") - - data = preprocess.Preprocess_Path_One(train_data = data, - target_variable = target, - categorical_features = cat_features_pass, - apply_ordinal_encoding = apply_ordinal_encoding_pass, - ordinal_columns_and_categories = ordinal_columns_and_categories_pass, - apply_cardinality_reduction = apply_cardinality_reduction_pass, - cardinal_method = cardinal_method_pass, - cardinal_features = cardinal_features_pass, - numerical_features = numeric_features_pass, - time_features = date_features_pass, - features_todrop = ignore_features_pass, - numeric_imputation_strategy = numeric_imputation, - categorical_imputation_strategy = categorical_imputation_pass, - scale_data = normalize, - scaling_method = normalize_method, - Power_transform_data = transformation, - Power_transform_method = trans_method_pass, - apply_untrained_levels_treatment= handle_unknown_categorical, - untrained_levels_treatment_method = unknown_categorical_method_pass, - apply_pca = pca, - pca_method = pca_method_pass, - pca_variance_retained_or_number_of_components = pca_components_pass, - apply_zero_nearZero_variance = ignore_low_variance, - club_rare_levels = combine_rare_levels, - 
rara_level_threshold_percentage = rare_level_threshold, - apply_binning = apply_binning_pass, - features_to_binn = features_to_bin_pass, - remove_outliers = remove_outliers, - outlier_contamination_percentage = outliers_threshold, - outlier_methods = ['pca'], - remove_multicollinearity = remove_multicollinearity, - maximum_correlation_between_features = multicollinearity_threshold, - remove_perfect_collinearity = remove_perfect_collinearity, - cluster_entire_data = create_clusters, - range_of_clusters_to_try = cluster_iter, - apply_polynomial_trigonometry_features = polynomial_features, - max_polynomial = polynomial_degree, - trigonometry_calculations = trigonometry_features_pass, - top_poly_trig_features_to_select_percentage = polynomial_threshold, - apply_grouping = apply_grouping_pass, - features_to_group_ListofList = group_features_pass, - group_name = group_names_pass, - apply_feature_selection = feature_selection, - feature_selection_top_features_percentage = feature_selection_threshold, - apply_feature_interactions = apply_feature_interactions_pass, - feature_interactions_to_apply = interactions_to_apply_pass, - feature_interactions_top_features_to_select_percentage=interaction_threshold, - display_types = display_dtypes_pass, #this is for inferred input box - target_transformation = False, #not needed for classification - random_state = seed) - - progress.value += 1 - logger.info("Preprocessing pipeline created successfully") - - if hasattr(preprocess.dtypes, 'replacement'): - label_encoded = preprocess.dtypes.replacement - label_encoded = str(label_encoded).replace("'", '') - label_encoded = str(label_encoded).replace("{", '') - label_encoded = str(label_encoded).replace("}", '') - - else: - label_encoded = 'None' - - try: - res_type = ['quit','Quit','exit','EXIT','q','Q','e','E','QUIT','Exit'] - res = preprocess.dtypes.response - - if res in res_type: - sys.exit("(Process Exit): setup has been interupted with user command 'quit'. setup must rerun." ) - - except: - logger.error("(Process Exit): setup has been interupted with user command 'quit'. 
setup must rerun.") - - #save prep pipe - prep_pipe = preprocess.pipe - - logger.info("Creating grid variables") - - #generate values for grid show - missing_values = data_before_preprocess.isna().sum().sum() - if missing_values > 0: - missing_flag = True - else: - missing_flag = False - - if normalize is True: - normalize_grid = normalize_method - else: - normalize_grid = 'None' - - if transformation is True: - transformation_grid = transformation_method - else: - transformation_grid = 'None' - - if pca is True: - pca_method_grid = pca_method - else: - pca_method_grid = 'None' - - if pca is True: - pca_components_grid = pca_components_pass - else: - pca_components_grid = 'None' - - if combine_rare_levels: - rare_level_threshold_grid = rare_level_threshold - else: - rare_level_threshold_grid = 'None' - - if bin_numeric_features is None: - numeric_bin_grid = False - else: - numeric_bin_grid = True - - if remove_outliers is False: - outliers_threshold_grid = None - else: - outliers_threshold_grid = outliers_threshold - - if remove_multicollinearity is False: - multicollinearity_threshold_grid = None - else: - multicollinearity_threshold_grid = multicollinearity_threshold - - if create_clusters is False: - cluster_iter_grid = None - else: - cluster_iter_grid = cluster_iter - - if polynomial_features: - polynomial_degree_grid = polynomial_degree - else: - polynomial_degree_grid = None - - if polynomial_features or trigonometry_features: - polynomial_threshold_grid = polynomial_threshold - else: - polynomial_threshold_grid = None - - if feature_selection: - feature_selection_threshold_grid = feature_selection_threshold - else: - feature_selection_threshold_grid = None - - if feature_interaction or feature_ratio: - interaction_threshold_grid = interaction_threshold - else: - interaction_threshold_grid = None - - if ordinal_features is not None: - ordinal_features_grid = True - else: - ordinal_features_grid = False - - if handle_unknown_categorical: - unknown_categorical_method_grid = unknown_categorical_method - else: - unknown_categorical_method_grid = None - - if group_features is not None: - group_features_grid = True - else: - group_features_grid = False - - if high_cardinality_features is not None: - high_cardinality_features_grid = True - else: - high_cardinality_features_grid = False - - if high_cardinality_features_grid: - high_cardinality_method_grid = high_cardinality_method - else: - high_cardinality_method_grid = None - - learned_types = preprocess.dtypes.learent_dtypes - learned_types.drop(target, inplace=True) - - float_type = 0 - cat_type = 0 - - for i in preprocess.dtypes.learent_dtypes: - if 'float' in str(i): - float_type += 1 - elif 'object' in str(i): - cat_type += 1 - elif 'int' in str(i): - float_type += 1 - - """ - preprocessing ends here - """ - - #reset pandas option - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") - - logger.info("Creating global containers") - - #create an empty list for pickling later. 
- experiment__ = [] - - #create folds_shuffle_param - folds_shuffle_param = folds_shuffle - - #create n_jobs_param - n_jobs_param = n_jobs - - #create create_model_container - create_model_container = [] - - #create master_model_container - master_model_container = [] - - #create display container - display_container = [] - - #create logging parameter - logging_param = log_experiment - - #create exp_name_log param incase logging is False - exp_name_log = 'no_logging' - - #create an empty log_plots_param - if log_plots: - log_plots_param = True - else: - log_plots_param = False - - #create a fix_imbalance_param and fix_imbalance_method_param - fix_imbalance_param = fix_imbalance - fix_imbalance_method_param = fix_imbalance_method - - if fix_imbalance_method_param is None: - fix_imbalance_model_name = 'SMOTE' - else: - fix_imbalance_model_name = str(fix_imbalance_method_param).split("(")[0] - - #sample estimator - if sample_estimator is None: - model = LogisticRegression() - else: - model = sample_estimator - - model_name = str(model).split("(")[0] - if 'CatBoostClassifier' in model_name: - model_name = 'CatBoostClassifier' - - #creating variables to be used later in the function - X = data.drop(target,axis=1) - y = data[target] - - #determining target type - if y.value_counts().count() > 2: - target_type = 'Multiclass' - else: - target_type = 'Binary' - - progress.value += 1 - - if sampling is True and data.shape[0] > 25000: #change this back to 25000 - - logger.info("Sampling dataset") - - split_perc = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99] - split_perc_text = ['10%','20%','30%','40%','50%','60%', '70%', '80%', '90%', '100%'] - split_perc_tt = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99] - split_perc_tt_total = [] - split_percent = [] - - metric_results = [] - metric_name = [] - - counter = 0 - - for i in split_perc: - - progress.value += 1 - - t0 = time.time() - - ''' - MONITOR UPDATE STARTS - ''' - - perc_text = split_perc_text[counter] - monitor.iloc[1,1:] = 'Fitting Model on ' + perc_text + ' sample' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - X_, X__, y_, y__ = train_test_split(X, y, test_size=1-i, stratify=y, random_state=seed, shuffle=data_split_shuffle) - X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.3, stratify=y_, random_state=seed, shuffle=data_split_shuffle) - model.fit(X_train,y_train) - pred_ = model.predict(X_test) - try: - pred_prob = model.predict_proba(X_test)[:,1] - except: - logger.warning("model has no predict_proba attribute.") - pred_prob = 0 - - #accuracy - acc = metrics.accuracy_score(y_test,pred_) - metric_results.append(acc) - metric_name.append('Accuracy') - split_percent.append(i) - - #auc - if y.value_counts().count() > 2: - pass - else: - try: - auc = metrics.roc_auc_score(y_test,pred_prob) - metric_results.append(auc) - metric_name.append('AUC') - split_percent.append(i) - except: - pass - - #recall - if y.value_counts().count() > 2: - recall = metrics.recall_score(y_test,pred_, average='macro') - metric_results.append(recall) - metric_name.append('Recall') - split_percent.append(i) - else: - recall = metrics.recall_score(y_test,pred_) - metric_results.append(recall) - metric_name.append('Recall') - split_percent.append(i) - - #recall - if y.value_counts().count() > 2: - precision = metrics.precision_score(y_test,pred_, average='weighted') - metric_results.append(precision) - metric_name.append('Precision') - split_percent.append(i) - else: - precision = 
metrics.precision_score(y_test,pred_) - metric_results.append(precision) - metric_name.append('Precision') - split_percent.append(i) - - #F1 - if y.value_counts().count() > 2: - f1 = metrics.f1_score(y_test,pred_, average='weighted') - metric_results.append(f1) - metric_name.append('F1') - split_percent.append(i) - else: - f1 = metrics.f1_score(y_test,pred_) - metric_results.append(f1) - metric_name.append('F1') - split_percent.append(i) - - #Kappa - kappa = metrics.cohen_kappa_score(y_test,pred_) - metric_results.append(kappa) - metric_name.append('Kappa') - split_percent.append(i) - - t1 = time.time() - - ''' - Time calculation begins - ''' - - tt = t1 - t0 - total_tt = tt / i - split_perc_tt.pop(0) - - for remain in split_perc_tt: - ss = total_tt * remain - split_perc_tt_total.append(ss) - - ttt = sum(split_perc_tt_total) / 60 - ttt = np.around(ttt, 2) - - if ttt < 1: - ttt = str(np.around((ttt * 60), 2)) - ETC = ttt + ' Seconds Remaining' - - else: - ttt = str(ttt) - ETC = ttt + ' Minutes Remaining' - - monitor.iloc[2,1:] = ETC - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - - ''' - Time calculation Ends - ''' - - split_perc_tt_total = [] - counter += 1 - - model_results = pd.DataFrame({'Sample' : split_percent, 'Metric' : metric_results, 'Metric Name': metric_name}) - fig = px.line(model_results, x='Sample', y='Metric', color='Metric Name', line_shape='linear', range_y = [0,1]) - fig.update_layout(plot_bgcolor='rgb(245,245,245)') - title = str(model_name) + ' Metrics and Sample %' - fig.update_layout(title={'text': title, 'y':0.95,'x':0.45,'xanchor': 'center','yanchor': 'top'}) - fig.show() - - monitor.iloc[1,1:] = 'Waiting for input' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - - print('Please enter the sample % of data you would like to use for modeling. Example: Enter 0.3 for 30%.') - print('Press Enter if you would like to use 100% of the data.') - - sample_size = input("Sample Size: ") - - if sample_size == '' or sample_size == '1': - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, stratify=y, random_state=seed, shuffle=data_split_shuffle) - - ''' - Final display Starts - ''' - clear_output() - if profile: - print('Setup Successfully Completed! Loading Profile Now... 
print('Setup Successfully Completed! Loading Profile Now... Please Wait!') - else: - if verbose: - print('Setup Successfully Completed!') - - functions = pd.DataFrame ( [ ['session_id', seed ], - ['Target Type', target_type], - ['Label Encoded', label_encoded], - ['Original Data', data_before_preprocess.shape ], - ['Missing Values ', missing_flag], - ['Numeric Features ', str(float_type) ], - ['Categorical Features ', str(cat_type) ], - ['Ordinal Features ', ordinal_features_grid], - ['High Cardinality Features ', high_cardinality_features_grid], - ['High Cardinality Method ', high_cardinality_method_grid], - ['Sampled Data', '(' + str(X_train.shape[0] + X_test.shape[0]) + ', ' + str(data_before_preprocess.shape[1]) + ')' ], - ['Transformed Train Set', X_train.shape ], - ['Transformed Test Set',X_test.shape ], - ['Numeric Imputer ', numeric_imputation], - ['Categorical Imputer ', categorical_imputation], - ['Normalize ', normalize ], - ['Normalize Method ', normalize_grid ], - ['Transformation ', transformation ], - ['Transformation Method ', transformation_grid ], - ['PCA ', pca], - ['PCA Method ', pca_method_grid], - ['PCA Components ', pca_components_grid], - ['Ignore Low Variance ', ignore_low_variance], - ['Combine Rare Levels ', combine_rare_levels], - ['Rare Level Threshold ', rare_level_threshold_grid], - ['Numeric Binning ', numeric_bin_grid], - ['Remove Outliers ', remove_outliers], - ['Outliers Threshold ', outliers_threshold_grid], - ['Remove Multicollinearity ', remove_multicollinearity], - ['Multicollinearity Threshold ', multicollinearity_threshold_grid], - ['Clustering ', create_clusters], - ['Clustering Iteration ', cluster_iter_grid], - ['Polynomial Features ', polynomial_features], - ['Polynomial Degree ', polynomial_degree_grid], - ['Trigonometry Features ', trigonometry_features], - ['Polynomial Threshold ', polynomial_threshold_grid], - ['Group Features ', group_features_grid], - ['Feature Selection ', feature_selection], - ['Feature Selection Threshold ', feature_selection_threshold_grid], - ['Feature Interaction ', feature_interaction], - ['Feature Ratio ', feature_ratio], - ['Interaction Threshold ', interaction_threshold_grid], - ['Fix Imbalance', fix_imbalance_param], - ['Fix Imbalance Method', fix_imbalance_model_name] - ], columns = ['Description', 'Value'] ) - - functions_ = functions.style.apply(highlight_max) - if verbose: - if html_param: - display(functions_) - else: - print(functions_.data) - - if profile: - try: - import pandas_profiling - pf = pandas_profiling.ProfileReport(data_before_preprocess) - clear_output() - display(pf) - except: - print('Data Profiler Failed. No output to show, please continue with Modeling.')
logger.error("Data Profiler Failed. No output to show, please continue with Modeling.") - - ''' - Final display Ends - ''' - - #log into experiment - experiment__.append(('Classification Setup Config', functions)) - experiment__.append(('X_training Set', X_train)) - experiment__.append(('y_training Set', y_train)) - experiment__.append(('X_test Set', X_test)) - experiment__.append(('y_test Set', y_test)) - experiment__.append(('Transformation Pipeline', prep_pipe)) - - else: - - sample_n = float(sample_size) - X_selected, X_discard, y_selected, y_discard = train_test_split(X, y, test_size=1-sample_n, stratify=y, - random_state=seed, shuffle=data_split_shuffle) - - X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=1-train_size, stratify=y_selected, - random_state=seed, shuffle=data_split_shuffle) - clear_output() - - - ''' - Final display Starts - ''' - - - clear_output() - if profile: - print('Setup Successfully Completed! Loading Profile Now... Please Wait!') - else: - if verbose: - print('Setup Successfully Completed!') - - functions = pd.DataFrame ( [ ['session_id', seed ], - ['Target Type', target_type], - ['Label Encoded', label_encoded], - ['Original Data', data_before_preprocess.shape ], - ['Missing Values ', missing_flag], - ['Numeric Features ', str(float_type) ], - ['Categorical Features ', str(cat_type) ], - ['Ordinal Features ', ordinal_features_grid], - ['High Cardinality Features ', high_cardinality_features_grid], - ['High Cardinality Method ', high_cardinality_method_grid], - ['Sampled Data', '(' + str(X_train.shape[0] + X_test.shape[0]) + ', ' + str(data_before_preprocess.shape[1]) + ')' ], - ['Transformed Train Set', X_train.shape ], - ['Transformed Test Set',X_test.shape ], - ['Numeric Imputer ', numeric_imputation], - ['Categorical Imputer ', categorical_imputation], - ['Normalize ', normalize ], - ['Normalize Method ', normalize_grid ], - ['Transformation ', transformation ], - ['Transformation Method ', transformation_grid ], - ['PCA ', pca], - ['PCA Method ', pca_method_grid], - ['PCA Components ', pca_components_grid], - ['Ignore Low Variance ', ignore_low_variance], - ['Combine Rare Levels ', combine_rare_levels], - ['Rare Level Threshold ', rare_level_threshold_grid], - ['Numeric Binning ', numeric_bin_grid], - ['Remove Outliers ', remove_outliers], - ['Outliers Threshold ', outliers_threshold_grid], - ['Remove Multicollinearity ', remove_multicollinearity], - ['Multicollinearity Threshold ', multicollinearity_threshold_grid], - ['Clustering ', create_clusters], - ['Clustering Iteration ', cluster_iter_grid], - ['Polynomial Features ', polynomial_features], - ['Polynomial Degree ', polynomial_degree_grid], - ['Trigonometry Features ', trigonometry_features], - ['Polynomial Threshold ', polynomial_threshold_grid], - ['Group Features ', group_features_grid], - ['Feature Selection ', feature_selection], - ['Feature Selection Threshold ', feature_selection_threshold_grid], - ['Feature Interaction ', feature_interaction], - ['Feature Ratio ', feature_ratio], - ['Interaction Threshold ', interaction_threshold_grid], - ['Fix Imbalance', fix_imbalance_param], - ['Fix Imbalance Method', fix_imbalance_model_name] - ], columns = ['Description', 'Value'] ) - - #functions_ = functions.style.hide_index() - functions_ = functions.style.apply(highlight_max) - if verbose: - if html_param: - display(functions_) - else: - print(functions_.data) - - if profile: - try: - import pandas_profiling - pf = pandas_profiling.ProfileReport(data_before_preprocess) - clear_output() - display(pf)
- except: - print('Data Profiler Failed. No output to show, please continue with Modeling.') - logger.error("Data Profiler Failed. No output to show, please continue with Modeling.") - - ''' - Final display Ends - ''' - - #log into experiment - experiment__.append(('Classification Setup Config', functions)) - experiment__.append(('X_training Set', X_train)) - experiment__.append(('y_training Set', y_train)) - experiment__.append(('X_test Set', X_test)) - experiment__.append(('y_test Set', y_test)) - experiment__.append(('Transformation Pipeline', prep_pipe)) - - else: - - monitor.iloc[1,1:] = 'Splitting Data' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, stratify=y, random_state=seed, shuffle=data_split_shuffle) - progress.value += 1 - - clear_output() - - ''' - Final display Starts - ''' - clear_output() - if profile: - print('Setup Successfully Completed! Loading Profile Now... Please Wait!') - else: - if verbose: - print('Setup Successfully Completed!') - - functions = pd.DataFrame ( [ ['session_id', seed ], - ['Target Type', target_type], - ['Label Encoded', label_encoded], - ['Original Data', data_before_preprocess.shape ], - ['Missing Values ', missing_flag], - ['Numeric Features ', str(float_type) ], - ['Categorical Features ', str(cat_type) ], - ['Ordinal Features ', ordinal_features_grid], - ['High Cardinality Features ', high_cardinality_features_grid], - ['High Cardinality Method ', high_cardinality_method_grid], - ['Sampled Data', '(' + str(X_train.shape[0] + X_test.shape[0]) + ', ' + str(data_before_preprocess.shape[1]) + ')' ], - ['Transformed Train Set', X_train.shape ], - ['Transformed Test Set',X_test.shape ], - ['Numeric Imputer ', numeric_imputation], - ['Categorical Imputer ', categorical_imputation], - ['Normalize ', normalize ], - ['Normalize Method ', normalize_grid ], - ['Transformation ', transformation ], - ['Transformation Method ', transformation_grid ], - ['PCA ', pca], - ['PCA Method ', pca_method_grid], - ['PCA Components ', pca_components_grid], - ['Ignore Low Variance ', ignore_low_variance], - ['Combine Rare Levels ', combine_rare_levels], - ['Rare Level Threshold ', rare_level_threshold_grid], - ['Numeric Binning ', numeric_bin_grid], - ['Remove Outliers ', remove_outliers], - ['Outliers Threshold ', outliers_threshold_grid], - ['Remove Multicollinearity ', remove_multicollinearity], - ['Multicollinearity Threshold ', multicollinearity_threshold_grid], - ['Clustering ', create_clusters], - ['Clustering Iteration ', cluster_iter_grid], - ['Polynomial Features ', polynomial_features], - ['Polynomial Degree ', polynomial_degree_grid], - ['Trigonometry Features ', trigonometry_features], - ['Polynomial Threshold ', polynomial_threshold_grid], - ['Group Features ', group_features_grid], - ['Feature Selection ', feature_selection], - ['Feature Selection Threshold ', feature_selection_threshold_grid], - ['Feature Interaction ', feature_interaction], - ['Feature Ratio ', feature_ratio], - ['Interaction Threshold ', interaction_threshold_grid], - ['Fix Imbalance', fix_imbalance_param], - ['Fix Imbalance Method', fix_imbalance_model_name] - ], columns = ['Description', 'Value'] ) - - functions_ = functions.style.apply(highlight_max) - if verbose: - if html_param: - display(functions_) - else: - print(functions_.data) - - if profile: - try: - import pandas_profiling - pf = pandas_profiling.ProfileReport(data_before_preprocess) - clear_output() - display(pf)
- except: - print('Data Profiler Failed. No output to show, please continue with Modeling.') - logger.error("Data Profiler Failed. No output to show, please continue with Modeling.") - - ''' - Final display Ends - ''' - - #log into experiment - experiment__.append(('Classification Setup Config', functions)) - experiment__.append(('X_training Set', X_train)) - experiment__.append(('y_training Set', y_train)) - experiment__.append(('X_test Set', X_test)) - experiment__.append(('y_test Set', y_test)) - experiment__.append(('Transformation Pipeline', prep_pipe)) - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - if logging_param: - - logger.info("Logging experiment in MLFlow") - - import mlflow - import secrets - from pathlib import Path - - if experiment_name is None: - exp_name_ = 'clf-default-name' - else: - exp_name_ = experiment_name - - exp_name_log = exp_name_ - - try: - mlflow.create_experiment(exp_name_log) - except: - pass - - #mlflow logging - mlflow.set_experiment(exp_name_log) - - run_name_ = 'Session Initialized ' + str(USI) - - with mlflow.start_run(run_name=run_name_) as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - k = functions.copy() - k.set_index('Description',drop=True,inplace=True) - kdict = k.to_dict() - params = kdict.get('Value') - mlflow.log_params(params) - - #set tag of setup - mlflow.set_tag("Source", "setup") - - URI = secrets.token_hex(nbytes=4) - mlflow.set_tag("URI", URI) - - mlflow.set_tag("USI", USI) - - mlflow.set_tag("Run Time", runtime) - - mlflow.set_tag("Run ID", RunID) - - # Log the transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(prep_pipe, 'Transformation Pipeline', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Transformation Pipeline' + '.pkl') - size_bytes = Path('Transformation Pipeline.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Transformation Pipeline.pkl') - - # Log pandas profile - if log_profile: - import pandas_profiling - pf = pandas_profiling.ProfileReport(data_before_preprocess) - pf.to_file("Data Profile.html") - mlflow.log_artifact("Data Profile.html") - os.remove("Data Profile.html") - clear_output() - display(functions_) - - # Log training and testing set - if log_data: - X_train.join(y_train).to_csv('Train.csv') - X_test.join(y_test).to_csv('Test.csv') - mlflow.log_artifact("Train.csv") - mlflow.log_artifact("Test.csv") - os.remove('Train.csv') - os.remove('Test.csv')
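[Editor's note: input.txt, written just below, stores str(list) of the raw input column names so that a deployed pipeline can later validate incoming data. A minimal, hypothetical scoring-time helper (not part of this module) that reads the artifact back might look like this:]

    import ast

    def check_input_columns(df, path='input.txt'):
        # input.txt holds the string form of a Python list, e.g. "['col1', 'col2']"
        with open(path) as f:
            expected = ast.literal_eval(f.read())
        missing = [c for c in expected if c not in df.columns]
        if missing:
            raise ValueError('Missing required columns: {}'.format(missing))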
- # Log input.txt that contains name of columns required in dataset - # to use this pipeline based on USI/URI. - - input_cols = list(data_before_preprocess.columns) - input_cols.remove(target) - - with open("input.txt", "w") as output: - output.write(str(input_cols)) - - mlflow.log_artifact("input.txt") - os.remove('input.txt') - - logger.info("create_model_container " + str(len(create_model_container))) - logger.info("master_model_container " + str(len(master_model_container))) - logger.info("display_container " + str(len(display_container))) - - logger.info(str(prep_pipe)) - logger.info("setup() successfully completed......................................") - - return X, y, X_train, X_test, y_train, y_test, seed, prep_pipe, experiment__,\ - folds_shuffle_param, n_jobs_param, html_param, create_model_container, master_model_container,\ - display_container, exp_name_log, logging_param, log_plots_param, USI,\ - fix_imbalance_param, fix_imbalance_method_param, logger - -def create_model(estimator = None, - ensemble = False, - method = None, - fold = 10, - round = 4, - cross_validation = True, #added in pycaret==2.0.0 - verbose = True, - system = True, #added in pycaret==2.0.0 - **kwargs): #added in pycaret==2.0.0 - - """ - - Description: - ------------ - This function creates a model and scores it using Stratified Cross Validation. - The output prints a score grid that shows Accuracy, AUC, Recall, Precision, - F1, Kappa and MCC by fold (default = 10 Fold). - - This function returns a trained model object. - - setup() must be called before using create_model(). - - Example - ------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - - lr = create_model('lr') - - This will create a trained Logistic Regression model. - - Parameters - ---------- - estimator : string / object, default = None - - Enter ID of the estimators available in model library or pass an untrained model - object consistent with fit / predict API to train and evaluate model. All estimators - support binary and multiclass problems. List of estimators in model library: - - ID Name - -------- ---------- - 'lr' Logistic Regression - 'knn' K Nearest Neighbour - 'nb' Naive Bayes - 'dt' Decision Tree Classifier - 'svm' SVM - Linear Kernel - 'rbfsvm' SVM - Radial Kernel - 'gpc' Gaussian Process Classifier - 'mlp' Multi Layer Perceptron - 'ridge' Ridge Classifier - 'rf' Random Forest Classifier - 'qda' Quadratic Discriminant Analysis - 'ada' Ada Boost Classifier - 'gbc' Gradient Boosting Classifier - 'lda' Linear Discriminant Analysis - 'et' Extra Trees Classifier - 'xgboost' Extreme Gradient Boosting - 'lightgbm' Light Gradient Boosting - 'catboost' CatBoost Classifier - - ensemble: Boolean, default = False - True would result in an ensemble of the estimator using the method defined in - the method parameter. - - method: String, 'Bagging' or 'Boosting', default = None. - method must be defined when ensemble is set to True. Default method is set to None. - - fold: integer, default = 10 - Number of folds to be used in Kfold CV. Must be at least 2. - - round: integer, default = 4 - Number of decimal places the metrics in the score grid will be rounded to. - - cross_validation: bool, default = True - When cross_validation is set to False, the fold parameter is ignored and the model - is trained on the entire training dataset. No metric evaluation is returned. - - verbose: Boolean, default = True - Score grid is not printed when verbose is set to False. - - system: Boolean, default = True - Must remain True at all times. Only to be changed by internal functions.
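[Editor's note: the **kwargs documented next are forwarded to the underlying estimator's constructor. A hedged usage sketch, reusing the 'juice' example above (silent=True is assumed here only to skip the interactive dtype confirmation):]

    from pycaret.datasets import get_data
    from pycaret.classification import setup, create_model

    juice = get_data('juice')
    exp = setup(data=juice, target='Purchase', silent=True)

    # extra keyword arguments reach the scikit-learn estimator directly
    rf = create_model('rf', n_estimators=200, max_depth=5)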
- - **kwargs: - Additional keyword arguments to pass to the estimator. - - Returns: - -------- - - score grid: A table containing the scores of the model across the kfolds. - ----------- Scoring metrics used are Accuracy, AUC, Recall, Precision, F1, - Kappa and MCC. Mean and standard deviation of the scores across - the folds are highlighted in yellow. - - model: trained model object - ----------- - - Warnings: - --------- - - 'svm' and 'ridge' do not support the predict_proba method. As such, AUC will be - returned as zero (0.0). - - - If the target variable is multiclass (more than 2 classes), AUC will be returned - as zero (0.0). - - - 'rbfsvm' and 'gpc' use non-linear kernels, so their fit time complexity is - more than quadratic. These estimators are hard to scale on datasets with more - than 10,000 samples. - - - - """ - - - ''' - - ERROR HANDLING STARTS HERE - - ''' - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create file handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing create_model()") - logger.info("""create_model(estimator={}, ensemble={}, method={}, fold={}, round={}, cross_validation={}, verbose={}, system={})""".\ - format(str(estimator), str(ensemble), str(method), str(fold), str(round), str(cross_validation), str(verbose), str(system))) - - logger.info("Checking exceptions") - - #exception checking - import sys - - #run_time - import datetime, time - runtime_start = time.time() - - #checking error for estimator (string) - available_estimators = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', - 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost'] - - #only raise exception if estimator is of type string. - if type(estimator) is str: - if estimator not in available_estimators: - sys.exit('(Value Error): Estimator Not Available. Please see docstring for list of available estimators.') - - #checking error for ensemble: - if type(ensemble) is not bool: - sys.exit('(Type Error): Ensemble parameter can only take argument as True or False.') - - #checking error for method: - - #1 Check when method is given and ensemble is not set to True. - if ensemble is False and method is not None: - sys.exit('(Type Error): Method parameter only accepts value when ensemble is set to True.') - - #2 Check when ensemble is set to True and method is not passed. - if ensemble is True and method is None: - sys.exit("(Type Error): Method parameter missing. Pass method = 'Bagging' or 'Boosting'.") - - #3 Check when ensemble is set to True and method is passed but not allowed.
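[Editor's note: as a usage-level summary of checks #1-#3 (including check #3, whose code follows just below), a hedged illustration of which calls these guards accept and reject:]

    create_model('dt', ensemble=True, method='Bagging')     # valid
    create_model('dt', ensemble=True, method='Boosting')    # valid
    create_model('dt', method='Bagging')                    # rejected by check #1
    create_model('dt', ensemble=True)                       # rejected by check #2
    create_model('dt', ensemble=True, method='Stacking')    # rejected by check #3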
- available_method = ['Bagging', 'Boosting'] - if ensemble is True and method not in available_method: - sys.exit("(Value Error): Method parameter only accepts two values 'Bagging' or 'Boosting'.") - - #checking fold parameter - if type(fold) is not int: - sys.exit('(Type Error): Fold parameter only accepts integer value.') - - #checking round parameter - if type(round) is not int: - sys.exit('(Type Error): Round parameter only accepts integer value.') - - #checking verbose parameter - if type(verbose) is not bool: - sys.exit('(Type Error): Verbose parameter can only take argument as True or False.') - - #checking system parameter - if type(system) is not bool: - sys.exit('(Type Error): System parameter can only take argument as True or False.') - - #checking cross_validation parameter - if type(cross_validation) is not bool: - sys.exit('(Type Error): cross_validation parameter can only take argument as True or False.') - - #checking boosting conflict with estimators - boosting_not_supported = ['lda','qda','ridge','mlp','gpc','svm','knn', 'catboost'] - if method == 'Boosting' and estimator in boosting_not_supported: - sys.exit("(Type Error): Estimator does not provide class_weights or predict_proba function and hence not supported for the Boosting method. Change the estimator or method to 'Bagging'.") - - - ''' - - ERROR HANDLING ENDS HERE - - ''' - - logger.info("Preloading libraries") - - #pre-load libraries - import pandas as pd - import ipywidgets as ipw - from IPython.display import display, HTML, clear_output, update_display - - logger.info("Preparing display monitor") - - #progress bar - progress = ipw.IntProgress(value=0, min=0, max=fold+4, step=1 , description='Processing: ') - master_display = pd.DataFrame(columns=['Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa', 'MCC']) - if verbose: - if html_param: - display(progress) - - #display monitor - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ], - ['ETC' , '. . . . . . . . . . . . . . . . . 
.', 'Calculating ETC'] ], - columns=['', ' ', ' ']).set_index('') - - if verbose: - if html_param: - display(monitor, display_id = 'monitor') - - if verbose: - if html_param: - display_ = display(master_display, display_id=True) - display_id = display_.display_id - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - logger.info("Copying training dataset") - - #Storing X_train and y_train in data_X and data_y parameter - data_X = X_train.copy() - data_y = y_train.copy() - - #reset index - data_X.reset_index(drop=True, inplace=True) - data_y.reset_index(drop=True, inplace=True) - - logger.info("Importing libraries") - - #general dependencies - import numpy as np - from sklearn import metrics - from sklearn.model_selection import StratifiedKFold - - progress.value += 1 - - logger.info("Defining folds") - - #cross validation setup starts here - kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param) - - logger.info("Declaring metric variables") - - score_auc =np.empty((0,0)) - score_acc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_mcc =np.empty((0,0)) - score_training_time =np.empty((0,0)) - avgs_auc =np.empty((0,0)) - avgs_acc =np.empty((0,0)) - avgs_recall =np.empty((0,0)) - avgs_precision =np.empty((0,0)) - avgs_f1 =np.empty((0,0)) - avgs_kappa =np.empty((0,0)) - avgs_mcc =np.empty((0,0)) - avgs_training_time =np.empty((0,0)) - - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Selecting Estimator' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - logger.info("Importing untrained model") - - if estimator == 'lr': - - from sklearn.linear_model import LogisticRegression - model = LogisticRegression(random_state=seed, **kwargs) - full_name = 'Logistic Regression' - - elif estimator == 'knn': - - from sklearn.neighbors import KNeighborsClassifier - model = KNeighborsClassifier(n_jobs=n_jobs_param, **kwargs) - full_name = 'K Neighbors Classifier' - - elif estimator == 'nb': - - from sklearn.naive_bayes import GaussianNB - model = GaussianNB(**kwargs) - full_name = 'Naive Bayes' - - elif estimator == 'dt': - - from sklearn.tree import DecisionTreeClassifier - model = DecisionTreeClassifier(random_state=seed, **kwargs) - full_name = 'Decision Tree Classifier' - - elif estimator == 'svm': - - from sklearn.linear_model import SGDClassifier - model = SGDClassifier(max_iter=1000, tol=0.001, random_state=seed, n_jobs=n_jobs_param, **kwargs) - full_name = 'SVM - Linear Kernel' - - elif estimator == 'rbfsvm': - - from sklearn.svm import SVC - model = SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed, **kwargs) - full_name = 'SVM - Radial Kernel' - - elif estimator == 'gpc': - - from sklearn.gaussian_process import GaussianProcessClassifier - model = GaussianProcessClassifier(random_state=seed, n_jobs=n_jobs_param, **kwargs) - full_name = 'Gaussian Process Classifier' - - elif estimator == 'mlp': - - from sklearn.neural_network import MLPClassifier - model = MLPClassifier(max_iter=500, random_state=seed, **kwargs) - full_name = 'MLP Classifier' - - elif estimator == 'ridge': - - from sklearn.linear_model import RidgeClassifier - model = RidgeClassifier(random_state=seed, **kwargs) - full_name = 'Ridge Classifier' - - elif estimator == 'rf': - - from sklearn.ensemble import RandomForestClassifier - model = RandomForestClassifier(n_estimators=10, 
random_state=seed, n_jobs=n_jobs_param, **kwargs) - full_name = 'Random Forest Classifier' - - elif estimator == 'qda': - - from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis - model = QuadraticDiscriminantAnalysis(**kwargs) - full_name = 'Quadratic Discriminant Analysis' - - elif estimator == 'ada': - - from sklearn.ensemble import AdaBoostClassifier - model = AdaBoostClassifier(random_state=seed, **kwargs) - full_name = 'Ada Boost Classifier' - - elif estimator == 'gbc': - - from sklearn.ensemble import GradientBoostingClassifier - model = GradientBoostingClassifier(random_state=seed, **kwargs) - full_name = 'Gradient Boosting Classifier' - - elif estimator == 'lda': - - from sklearn.discriminant_analysis import LinearDiscriminantAnalysis - model = LinearDiscriminantAnalysis(**kwargs) - full_name = 'Linear Discriminant Analysis' - - elif estimator == 'et': - - from sklearn.ensemble import ExtraTreesClassifier - model = ExtraTreesClassifier(random_state=seed, n_jobs=n_jobs_param, **kwargs) - full_name = 'Extra Trees Classifier' - - elif estimator == 'xgboost': - - from xgboost import XGBClassifier - model = XGBClassifier(random_state=seed, verbosity=0, n_jobs=n_jobs_param, **kwargs) - full_name = 'Extreme Gradient Boosting' - - elif estimator == 'lightgbm': - - import lightgbm as lgb - model = lgb.LGBMClassifier(random_state=seed, n_jobs=n_jobs_param, **kwargs) - full_name = 'Light Gradient Boosting Machine' - - elif estimator == 'catboost': - from catboost import CatBoostClassifier - model = CatBoostClassifier(random_state=seed, silent=True, thread_count=n_jobs_param, **kwargs) # Silent is True to suppress CatBoost iteration results - full_name = 'CatBoost Classifier' - - else: - - logger.info("Declaring custom model") - - model = estimator - - def get_model_name(e): - return str(e).split("(")[0] - - model_dict_logging = {'ExtraTreesClassifier' : 'Extra Trees Classifier', - 'GradientBoostingClassifier' : 'Gradient Boosting Classifier', - 'RandomForestClassifier' : 'Random Forest Classifier', - 'LGBMClassifier' : 'Light Gradient Boosting Machine', - 'XGBClassifier' : 'Extreme Gradient Boosting', - 'AdaBoostClassifier' : 'Ada Boost Classifier', - 'DecisionTreeClassifier' : 'Decision Tree Classifier', - 'RidgeClassifier' : 'Ridge Classifier', - 'LogisticRegression' : 'Logistic Regression', - 'KNeighborsClassifier' : 'K Neighbors Classifier', - 'GaussianNB' : 'Naive Bayes', - 'SGDClassifier' : 'SVM - Linear Kernel', - 'SVC' : 'SVM - Radial Kernel', - 'GaussianProcessClassifier' : 'Gaussian Process Classifier', - 'MLPClassifier' : 'MLP Classifier', - 'QuadraticDiscriminantAnalysis' : 'Quadratic Discriminant Analysis', - 'LinearDiscriminantAnalysis' : 'Linear Discriminant Analysis', - 'CatBoostClassifier' : 'CatBoost Classifier', - 'BaggingClassifier' : 'Bagging Classifier', - 'VotingClassifier' : 'Voting Classifier'} - - if y.value_counts().count() > 2: - - mn = get_model_name(estimator.estimator) - - if 'catboost' in mn: - mn = 'CatBoostClassifier' - - if mn in model_dict_logging.keys(): - full_name = model_dict_logging.get(mn) - else: - full_name = mn - - else: - - mn = get_model_name(estimator) - - if 'catboost' in mn: - mn = 'CatBoostClassifier' - - if mn in model_dict_logging.keys(): - full_name = model_dict_logging.get(mn) - else: - full_name = mn - - logger.info(str(full_name) + ' Imported succesfully') - - progress.value += 1 - - #checking method when ensemble is set to True. 
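[Editor's note: the wrapping performed in the block below is plain scikit-learn; a minimal standalone sketch of the same construction, with illustrative names and a fixed random_state of 123 assumed:]

    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
    from sklearn.multiclass import OneVsRestClassifier

    base = DecisionTreeClassifier(random_state=123)
    bagged = BaggingClassifier(base, bootstrap=True, n_estimators=10, random_state=123)
    boosted = AdaBoostClassifier(base, n_estimators=10, random_state=123)
    # for multiclass targets the (possibly ensembled) model is further wrapped:
    ovr = OneVsRestClassifier(bagged, n_jobs=-1)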
- - logger.info("Checking ensemble method") - - if method == 'Bagging': - logger.info("Ensemble method set to Bagging") - from sklearn.ensemble import BaggingClassifier - model = BaggingClassifier(model,bootstrap=True,n_estimators=10, random_state=seed, n_jobs=n_jobs_param) - - elif method == 'Boosting': - logger.info("Ensemble method set to Boosting") - from sklearn.ensemble import AdaBoostClassifier - model = AdaBoostClassifier(model, n_estimators=10, random_state=seed) - - #multiclass checking - if y.value_counts().count() > 2: - logger.info("Target variable is Multiclass. OneVsRestClassifier activated") - from sklearn.multiclass import OneVsRestClassifier - model = OneVsRestClassifier(model, n_jobs=n_jobs_param) - - - ''' - MONITOR UPDATE STARTS - ''' - - if not cross_validation: - monitor.iloc[1,1:] = 'Fitting ' + str(full_name) - else: - monitor.iloc[1,1:] = 'Initializing CV' - - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - if not cross_validation: - - logger.info("Cross validation set to False") - - if fix_imbalance_param: - logger.info("Initializing SMOTE") - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state=seed) - else: - resampler = fix_imbalance_method_param - - data_X,data_y = resampler.fit_sample(data_X,data_y) - logger.info("Resampling completed") - - logger.info("Fitting Model") - model.fit(data_X,data_y) - - if verbose: - clear_output() - - logger.info("create_model_container " + str(len(create_model_container))) - logger.info("master_model_container " + str(len(master_model_container))) - logger.info("display_container " + str(len(display_container))) - - logger.info(str(model)) - logger.info("create_model() successfully completed......................................") - - return model - - fold_num = 1 - - for train_i , test_i in kf.split(data_X,data_y): - - logger.info("Initializing Fold " + str(fold_num)) - - t0 = time.time() - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Fitting Fold ' + str(fold_num) + ' of ' + str(fold) - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i] - ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i] - time_start=time.time() - - if fix_imbalance_param: - - logger.info("Initializing SMOTE") - - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state=seed) - else: - resampler = fix_imbalance_method_param - - Xtrain,ytrain = resampler.fit_sample(Xtrain, ytrain) - logger.info("Resampling completed") - - if hasattr(model, 'predict_proba'): - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - pred_prob = model.predict_proba(Xtest) - pred_prob = pred_prob[:,1] - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - logger.warning("model has no predict_proba attribute. 
AUC set to 0.00") - sc = 0 - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - else: - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - logger.warning("model has no predict_proba attribute. pred_prob set to 0.00") - pred_prob = 0.00 - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - logger.warning("model has no predict_proba attribute. AUC to 0.00") - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - logger.info("Compiling Metrics") - time_end=time.time() - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - training_time=time_end-time_start - score_acc = np.append(score_acc,sca) - score_auc = np.append(score_auc,sc) - score_recall = np.append(score_recall,recall) - score_precision = np.append(score_precision,precision) - score_f1 =np.append(score_f1,f1) - score_kappa =np.append(score_kappa,kappa) - score_mcc=np.append(score_mcc,mcc) - score_training_time = np.append(score_training_time,training_time) - - progress.value += 1 - - ''' - - This section handles time calculation and is created to update_display() as code loops through - the fold defined. - - ''' - - fold_results = pd.DataFrame({'Accuracy':[sca], 'AUC': [sc], 'Recall': [recall], - 'Prec.': [precision], 'F1': [f1], 'Kappa': [kappa], 'MCC':[mcc]}).round(round) - master_display = pd.concat([master_display, fold_results],ignore_index=True) - fold_results = [] - - ''' - TIME CALCULATION SUB-SECTION STARTS HERE - ''' - t1 = time.time() - - tt = (t1 - t0) * (fold-fold_num) / 60 - tt = np.around(tt, 2) - - if tt < 1: - tt = str(np.around((tt * 60), 2)) - ETC = tt + ' Seconds Remaining' - - else: - tt = str (tt) - ETC = tt + ' Minutes Remaining' - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[2,1:] = ETC - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - fold_num += 1 - - ''' - TIME CALCULATION ENDS HERE - ''' - - if verbose: - if html_param: - update_display(master_display, display_id = display_id) - - - ''' - - Update_display() ends here - - ''' - - logger.info("Calculating mean and std") - - mean_acc=np.mean(score_acc) - mean_auc=np.mean(score_auc) - mean_recall=np.mean(score_recall) - mean_precision=np.mean(score_precision) - mean_f1=np.mean(score_f1) - mean_kappa=np.mean(score_kappa) - mean_mcc=np.mean(score_mcc) - mean_training_time=np.sum(score_training_time) #changed it to sum from mean - - std_acc=np.std(score_acc) - std_auc=np.std(score_auc) - std_recall=np.std(score_recall) - std_precision=np.std(score_precision) - std_f1=np.std(score_f1) - std_kappa=np.std(score_kappa) - std_mcc=np.std(score_mcc) - std_training_time=np.std(score_training_time) - - avgs_acc = np.append(avgs_acc, mean_acc) - avgs_acc = np.append(avgs_acc, std_acc) - avgs_auc = np.append(avgs_auc, mean_auc) - avgs_auc = np.append(avgs_auc, std_auc) - avgs_recall = np.append(avgs_recall, mean_recall) - avgs_recall = np.append(avgs_recall, std_recall) - avgs_precision = np.append(avgs_precision, 
mean_precision) - avgs_precision = np.append(avgs_precision, std_precision) - avgs_f1 = np.append(avgs_f1, mean_f1) - avgs_f1 = np.append(avgs_f1, std_f1) - avgs_kappa = np.append(avgs_kappa, mean_kappa) - avgs_kappa = np.append(avgs_kappa, std_kappa) - avgs_mcc = np.append(avgs_mcc, mean_mcc) - avgs_mcc = np.append(avgs_mcc, std_mcc) - - avgs_training_time = np.append(avgs_training_time, mean_training_time) - avgs_training_time = np.append(avgs_training_time, std_training_time) - - progress.value += 1 - - logger.info("Creating metrics dataframe") - - model_results = pd.DataFrame({'Accuracy': score_acc, 'AUC': score_auc, 'Recall' : score_recall, 'Prec.' : score_precision , - 'F1' : score_f1, 'Kappa' : score_kappa, 'MCC': score_mcc}) - model_avgs = pd.DataFrame({'Accuracy': avgs_acc, 'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' : avgs_precision , - 'F1' : avgs_f1, 'Kappa' : avgs_kappa, 'MCC': avgs_mcc},index=['Mean', 'SD']) - - - model_results = model_results.append(model_avgs) - model_results = model_results.round(round) - - # yellow the mean - model_results=model_results.style.apply(lambda x: ['background: yellow' if (x.name == 'Mean') else '' for i in x], axis=1) - model_results = model_results.set_precision(round) - - #refitting the model on complete X_train, y_train - monitor.iloc[1,1:] = 'Finalizing Model' - monitor.iloc[2,1:] = 'Almost Finished' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - model_fit_start = time.time() - logger.info("Finalizing model") - model.fit(data_X, data_y) - model_fit_end = time.time() - - model_fit_time = np.array(model_fit_end - model_fit_start).round(2) - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - #mlflow logging - if logging_param and system: - - logger.info("Creating MLFlow logs") - - #Creating Logs message monitor - monitor.iloc[1,1:] = 'Creating Logs' - monitor.iloc[2,1:] = 'Almost Finished' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - #import mlflow - import mlflow - import mlflow.sklearn - from pathlib import Path - import os - - mlflow.set_experiment(exp_name_log) - - with mlflow.start_run(run_name=full_name) as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - # Log model parameters - params = model.get_params() - - for i in list(params): - v = params.get(i) - if len(str(v)) > 250: - params.pop(i) - - mlflow.log_params(params) - - # Log metrics - mlflow.log_metrics({"Accuracy": avgs_acc[0], "AUC": avgs_auc[0], "Recall": avgs_recall[0], "Precision" : avgs_precision[0], - "F1": avgs_f1[0], "Kappa": avgs_kappa[0], "MCC": avgs_mcc[0]}) - - #set tag of compare_models - mlflow.set_tag("Source", "create_model") - - import secrets - URI = secrets.token_hex(nbytes=4) - mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - # Log training time in seconds - mlflow.log_metric("TT", model_fit_time) - - # Log the CV results as model_results.html artifact - model_results.data.to_html('Results.html', col_space=65, justify='left') - mlflow.log_artifact('Results.html') - os.remove('Results.html') - - # Generate hold-out predictions and save as html - holdout = predict_model(model, verbose=False) - holdout_score = pull() - display_container.pop(-1) - holdout_score.to_html('Holdout.html', col_space=65, justify='left') - mlflow.log_artifact('Holdout.html') - os.remove('Holdout.html') - - # Log AUC and Confusion 
Matrix plot - - if log_plots_param: - - logger.info("SubProcess plot_model() called ==================================") - - try: - plot_model(model, plot = 'auc', verbose=False, save=True, system=False) - mlflow.log_artifact('AUC.png') - os.remove("AUC.png") - except: - pass - - try: - plot_model(model, plot = 'confusion_matrix', verbose=False, save=True, system=False) - mlflow.log_artifact('Confusion Matrix.png') - os.remove("Confusion Matrix.png") - except: - pass - - try: - plot_model(model, plot = 'feature', verbose=False, save=True, system=False) - mlflow.log_artifact('Feature Importance.png') - os.remove("Feature Importance.png") - except: - pass - - logger.info("SubProcess plot_model() end ==================================") - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(model, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - progress.value += 1 - - logger.info("Uploading results into container") - - #storing results in create_model_container - create_model_container.append(model_results.data) - display_container.append(model_results.data) - - #storing results in master_model_container - logger.info("Uploading model into container now") - master_model_container.append(model) - - if verbose: - clear_output() - - if html_param: - display(model_results) - else: - print(model_results.data) - - logger.info("create_model_container: " + str(len(create_model_container))) - logger.info("master_model_container: " + str(len(master_model_container))) - logger.info("display_container: " + str(len(display_container))) - - logger.info(str(model)) - logger.info("create_model() successfully completed......................................") - return model - -def ensemble_model(estimator, - method = 'Bagging', - fold = 10, - n_estimators = 10, - round = 4, - choose_better = False, #added in pycaret==2.0.0 - optimize = 'Accuracy', #added in pycaret==2.0.0 - verbose = True): - """ - - - Description: - ------------ - This function ensembles the trained base estimator using the method defined in - the 'method' param (default = 'Bagging'). The output prints a score grid that shows - Accuracy, AUC, Recall, Precision, F1, Kappa and MCC by fold (default = 10 Fold). - - This function returns a trained model object. - - Model must be created using create_model() or tune_model(). - - Example - ------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - dt = create_model('dt') - - ensembled_dt = ensemble_model(dt) - - This will return an ensembled Decision Tree model using 'Bagging'. - - Parameters - ---------- - estimator : object, default = None - - method: String, default = 'Bagging' - Bagging method will create an ensemble meta-estimator that fits base - classifiers each on random subsets of the original dataset. The other - available method is 'Boosting' which will create a meta-estimator by - fitting a classifier on the original dataset and then fitting additional - copies of the classifier on the same dataset, with the weights of - incorrectly classified instances adjusted so that subsequent - classifiers focus more on difficult cases.
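[Editor's note: a hedged usage sketch of both methods, continuing the 'juice' example above; the n_estimators and remaining parameters are documented next:]

    from pycaret.classification import create_model, ensemble_model

    dt = create_model('dt')
    bagged_dt = ensemble_model(dt)                                       # Bagging, the default
    boosted_dt = ensemble_model(dt, method='Boosting', n_estimators=50)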
- - fold: integer, default = 10 - Number of folds to be used in Kfold CV. Must be at least 2. - - n_estimators: integer, default = 10 - The number of base estimators in the ensemble. - In case of perfect fit, the learning procedure is stopped early. - - round: integer, default = 4 - Number of decimal places the metrics in the score grid will be rounded to. - - choose_better: Boolean, default = False - When set to True, the base estimator is returned when the metric doesn't - improve with ensemble_model. This guarantees the returned object performs - at least as well as the base estimator created using create_model or the model - returned by compare_models. - - optimize: string, default = 'Accuracy' - Only used when choose_better is set to True. The optimize parameter is used - to compare the ensembled model with the base estimator. Values accepted in - the optimize parameter are 'Accuracy', 'AUC', 'Recall', 'Precision', 'F1', - 'Kappa', 'MCC'. - - verbose: Boolean, default = True - Score grid is not printed when verbose is set to False. - - Returns: - -------- - - score grid: A table containing the scores of the model across the kfolds. - ----------- Scoring metrics used are Accuracy, AUC, Recall, Precision, F1, - Kappa and MCC. Mean and standard deviation of the scores across - the folds are also returned. - - model: trained ensembled model object - ----------- - - Warnings: - --------- - - If the target variable is multiclass (more than 2 classes), AUC will be returned - as zero (0.0). - - - """ - - - ''' - - ERROR HANDLING STARTS HERE - - ''' - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create file handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing ensemble_model()") - logger.info("""ensemble_model(estimator={}, method={}, fold={}, n_estimators={}, round={}, choose_better={}, optimize={}, verbose={})""".\ - format(str(estimator), str(method), str(fold), str(n_estimators), str(round), str(choose_better), str(optimize), str(verbose))) - - logger.info("Checking exceptions") - - #exception checking - import sys - - #run_time - import datetime, time - runtime_start = time.time() - - #Check for allowed method - available_method = ['Bagging', 'Boosting'] - if method not in available_method: - sys.exit("(Value Error): Method parameter only accepts two values 'Bagging' or 'Boosting'.") - - - #check boosting conflict - if method == 'Boosting': - - from sklearn.ensemble import AdaBoostClassifier - - try: - if hasattr(estimator,'n_classes_'): - if estimator.n_classes_ > 2: - check_model = estimator.estimator - check_model = AdaBoostClassifier(check_model, n_estimators=10, random_state=seed) - from sklearn.multiclass import OneVsRestClassifier - check_model = OneVsRestClassifier(check_model) - check_model.fit(X_train, y_train) - else: - check_model = AdaBoostClassifier(estimator, n_estimators=10, random_state=seed) - check_model.fit(X_train, y_train) - except: - sys.exit("(Type Error): Estimator does not provide class_weights or predict_proba function and hence not supported for the Boosting method. 
Change the estimator or method to 'Bagging'.") - - #checking fold parameter - if type(fold) is not int: - sys.exit('(Type Error): Fold parameter only accepts integer value.') - - #checking n_estimators parameter - if type(n_estimators) is not int: - sys.exit('(Type Error): n_estimators parameter only accepts integer value.') - - #checking round parameter - if type(round) is not int: - sys.exit('(Type Error): Round parameter only accepts integer value.') - - #checking verbose parameter - if type(verbose) is not bool: - sys.exit('(Type Error): Verbose parameter can only take argument as True or False.') - - ''' - - ERROR HANDLING ENDS HERE - - ''' - - logger.info("Preloading libraries") - - #pre-load libraries - import pandas as pd - import ipywidgets as ipw - from IPython.display import display, HTML, clear_output, update_display - - logger.info("Preparing display monitor") - - #progress bar - progress = ipw.IntProgress(value=0, min=0, max=fold+4, step=1 , description='Processing: ') - master_display = pd.DataFrame(columns=['Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa', 'MCC']) - if verbose: - if html_param: - display(progress) - - #display monitor - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ], - ['ETC' , '. . . . . . . . . . . . . . . . . .', 'Calculating ETC'] ], - columns=['', ' ', ' ']).set_index('') - - if verbose: - if html_param: - display(monitor, display_id = 'monitor') - - if verbose: - if html_param: - display_ = display(master_display, display_id=True) - display_id = display_.display_id - - logger.info("Importing libraries") - - #dependencies - import numpy as np - from sklearn import metrics - from sklearn.model_selection import StratifiedKFold - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - logger.info("Copying training dataset") - - #Storing X_train and y_train in data_X and data_y parameter - data_X = X_train.copy() - data_y = y_train.copy() - - #reset index - data_X.reset_index(drop=True, inplace=True) - data_y.reset_index(drop=True, inplace=True) - - progress.value += 1 - - #defining estimator as model - model = estimator - - if optimize == 'Accuracy': - compare_dimension = 'Accuracy' - elif optimize == 'AUC': - compare_dimension = 'AUC' - elif optimize == 'Recall': - compare_dimension = 'Recall' - elif optimize == 'Precision': - compare_dimension = 'Prec.' 
- elif optimize == 'F1': - compare_dimension = 'F1' - elif optimize == 'Kappa': - compare_dimension = 'Kappa' - elif optimize == 'MCC': - compare_dimension = 'MCC' - - logger.info("Checking base model") - - def get_model_name(e): - return str(e).split("(")[0] - - if y.value_counts().count() > 2: - mn = get_model_name(estimator.estimator) - else: - mn = get_model_name(estimator) - - if 'catboost' in str(estimator): - mn = 'CatBoostClassifier' - - model_dict = {'ExtraTreesClassifier' : 'et', - 'GradientBoostingClassifier' : 'gbc', - 'RandomForestClassifier' : 'rf', - 'LGBMClassifier' : 'lightgbm', - 'XGBClassifier' : 'xgboost', - 'AdaBoostClassifier' : 'ada', - 'DecisionTreeClassifier' : 'dt', - 'RidgeClassifier' : 'ridge', - 'LogisticRegression' : 'lr', - 'KNeighborsClassifier' : 'knn', - 'GaussianNB' : 'nb', - 'SGDClassifier' : 'svm', - 'SVC' : 'rbfsvm', - 'GaussianProcessClassifier' : 'gpc', - 'MLPClassifier' : 'mlp', - 'QuadraticDiscriminantAnalysis' : 'qda', - 'LinearDiscriminantAnalysis' : 'lda', - 'CatBoostClassifier' : 'catboost', - 'BaggingClassifier' : 'Bagging'} - - estimator__ = model_dict.get(mn) - - model_dict_logging = {'ExtraTreesClassifier' : 'Extra Trees Classifier', - 'GradientBoostingClassifier' : 'Gradient Boosting Classifier', - 'RandomForestClassifier' : 'Random Forest Classifier', - 'LGBMClassifier' : 'Light Gradient Boosting Machine', - 'XGBClassifier' : 'Extreme Gradient Boosting', - 'AdaBoostClassifier' : 'Ada Boost Classifier', - 'DecisionTreeClassifier' : 'Decision Tree Classifier', - 'RidgeClassifier' : 'Ridge Classifier', - 'LogisticRegression' : 'Logistic Regression', - 'KNeighborsClassifier' : 'K Neighbors Classifier', - 'GaussianNB' : 'Naive Bayes', - 'SGDClassifier' : 'SVM - Linear Kernel', - 'SVC' : 'SVM - Radial Kernel', - 'GaussianProcessClassifier' : 'Gaussian Process Classifier', - 'MLPClassifier' : 'MLP Classifier', - 'QuadraticDiscriminantAnalysis' : 'Quadratic Discriminant Analysis', - 'LinearDiscriminantAnalysis' : 'Linear Discriminant Analysis', - 'CatBoostClassifier' : 'CatBoost Classifier', - 'BaggingClassifier' : 'Bagging Classifier'} - - logger.info('Base model : ' + str(model_dict_logging.get(mn))) - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Selecting Estimator' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - if hasattr(estimator,'n_classes_'): - if estimator.n_classes_ > 2: - model = estimator.estimator - - logger.info("Importing untrained ensembler") - - if method == 'Bagging': - from sklearn.ensemble import BaggingClassifier - model = BaggingClassifier(model,bootstrap=True,n_estimators=n_estimators, random_state=seed, n_jobs=n_jobs_param) - logger.info("BaggingClassifier() succesfully imported") - - else: - from sklearn.ensemble import AdaBoostClassifier - model = AdaBoostClassifier(model, n_estimators=n_estimators, random_state=seed) - logger.info("AdaBoostClassifier() succesfully imported") - - if y.value_counts().count() > 2: - from sklearn.multiclass import OneVsRestClassifier - model = OneVsRestClassifier(model) - logger.info("OneVsRestClassifier() succesfully imported") - - progress.value += 1 - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Initializing CV' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - logger.info("Defining folds") - kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param) - - logger.info("Declaring metric variables") - 
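[Editor's note: the block below accumulates per-fold scores into flat arrays and later appends Mean/SD rows to the score grid. A condensed, self-contained sketch of the same pattern, on synthetic data with illustrative names, is:]

    import pandas as pd
    from sklearn import metrics
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold

    X, y = make_classification(n_samples=200, random_state=123)
    X, y = pd.DataFrame(X), pd.Series(y)

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    rows = []
    for train_i, test_i in kf.split(X, y):
        clf = LogisticRegression(max_iter=1000).fit(X.iloc[train_i], y.iloc[train_i])
        pred = clf.predict(X.iloc[test_i])
        rows.append({'Accuracy': metrics.accuracy_score(y.iloc[test_i], pred),
                     'F1': metrics.f1_score(y.iloc[test_i], pred)})

    # per-fold grid plus Mean/SD summary rows, mirroring the score grid built below
    scores = pd.DataFrame(rows)
    summary = pd.concat([scores, pd.DataFrame([scores.mean(), scores.std()], index=['Mean', 'SD'])])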
score_auc =np.empty((0,0)) - score_acc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_mcc =np.empty((0,0)) - score_training_time =np.empty((0,0)) - avgs_auc =np.empty((0,0)) - avgs_acc =np.empty((0,0)) - avgs_recall =np.empty((0,0)) - avgs_precision =np.empty((0,0)) - avgs_f1 =np.empty((0,0)) - avgs_kappa =np.empty((0,0)) - avgs_mcc =np.empty((0,0)) - avgs_training_time =np.empty((0,0)) - - - fold_num = 1 - - for train_i , test_i in kf.split(data_X,data_y): - - logger.info("Initializing Fold " + str(fold_num)) - - t0 = time.time() - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Fitting Fold ' + str(fold_num) + ' of ' + str(fold) - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i] - ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i] - time_start=time.time() - - if fix_imbalance_param: - logger.info("Initializing SMOTE") - - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state=seed) - else: - resampler = fix_imbalance_method_param - - Xtrain,ytrain = resampler.fit_sample(Xtrain, ytrain) - logger.info("Resampling completed") - - if hasattr(model, 'predict_proba'): - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - pred_prob = model.predict_proba(Xtest) - pred_prob = pred_prob[:,1] - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - logger.warning("model has no predict_proba attribute. AUC set to 0.00") - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - else: - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - pred_prob = 0.00 - logger.warning("model has no predict_proba attribute. pred_prob set to 0.00") - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - logger.warning("model has no predict_proba attribute. 
AUC set to 0.00") - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - logger.info("Compiling Metrics") - time_end=time.time() - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - training_time=time_end-time_start - score_acc = np.append(score_acc,sca) - score_auc = np.append(score_auc,sc) - score_recall = np.append(score_recall,recall) - score_precision = np.append(score_precision,precision) - score_f1 =np.append(score_f1,f1) - score_kappa =np.append(score_kappa,kappa) - score_mcc =np.append(score_mcc,mcc) - score_training_time =np.append(score_training_time,training_time) - progress.value += 1 - - - ''' - This section is created to update_display() as code loops through the fold defined. - ''' - - fold_results = pd.DataFrame({'Accuracy':[sca], 'AUC': [sc], 'Recall': [recall], - 'Prec.': [precision], 'F1': [f1], 'Kappa': [kappa], 'MCC':[mcc]}).round(round) - master_display = pd.concat([master_display, fold_results],ignore_index=True) - fold_results = [] - - ''' - - TIME CALCULATION SUB-SECTION STARTS HERE - - ''' - t1 = time.time() - - tt = (t1 - t0) * (fold-fold_num) / 60 - tt = np.around(tt, 2) - - if tt < 1: - tt = str(np.around((tt * 60), 2)) - ETC = tt + ' Seconds Remaining' - - else: - tt = str (tt) - ETC = tt + ' Minutes Remaining' - - if verbose: - if html_param: - update_display(ETC, display_id = 'ETC') - - fold_num += 1 - - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[2,1:] = ETC - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - ''' - - TIME CALCULATION ENDS HERE - - ''' - - if verbose: - if html_param: - update_display(master_display, display_id = display_id) - - ''' - - Update_display() ends here - - ''' - - logger.info("Calculating mean and std") - mean_acc=np.mean(score_acc) - mean_auc=np.mean(score_auc) - mean_recall=np.mean(score_recall) - mean_precision=np.mean(score_precision) - mean_f1=np.mean(score_f1) - mean_kappa=np.mean(score_kappa) - mean_mcc=np.mean(score_mcc) - mean_training_time=np.sum(score_training_time) - std_acc=np.std(score_acc) - std_auc=np.std(score_auc) - std_recall=np.std(score_recall) - std_precision=np.std(score_precision) - std_f1=np.std(score_f1) - std_kappa=np.std(score_kappa) - std_mcc=np.std(score_mcc) - std_training_time=np.std(score_training_time) - - avgs_acc = np.append(avgs_acc, mean_acc) - avgs_acc = np.append(avgs_acc, std_acc) - avgs_auc = np.append(avgs_auc, mean_auc) - avgs_auc = np.append(avgs_auc, std_auc) - avgs_recall = np.append(avgs_recall, mean_recall) - avgs_recall = np.append(avgs_recall, std_recall) - avgs_precision = np.append(avgs_precision, mean_precision) - avgs_precision = np.append(avgs_precision, std_precision) - avgs_f1 = np.append(avgs_f1, mean_f1) - avgs_f1 = np.append(avgs_f1, std_f1) - avgs_kappa = np.append(avgs_kappa, mean_kappa) - avgs_kappa = np.append(avgs_kappa, std_kappa) - - avgs_mcc = np.append(avgs_mcc, mean_mcc) - avgs_mcc = np.append(avgs_mcc, std_mcc) - - avgs_training_time = np.append(avgs_training_time, mean_training_time) - avgs_training_time = np.append(avgs_training_time, std_training_time) - - logger.info("Creating metrics dataframe") - model_results = pd.DataFrame({'Accuracy': score_acc, 'AUC': score_auc, 'Recall' : score_recall, 'Prec.' 
: score_precision , - 'F1' : score_f1, 'Kappa' : score_kappa, 'MCC':score_mcc}) - model_results_unpivot = pd.melt(model_results,value_vars=['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa','MCC']) - model_results_unpivot.columns = ['Metric', 'Measure'] - model_avgs = pd.DataFrame({'Accuracy': avgs_acc, 'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' : avgs_precision , - 'F1' : avgs_f1, 'Kappa' : avgs_kappa,'MCC':avgs_mcc},index=['Mean', 'SD']) - - model_results = model_results.append(model_avgs) - model_results = model_results.round(round) - - # yellow the mean - model_results=model_results.style.apply(lambda x: ['background: yellow' if (x.name == 'Mean') else '' for i in x], axis=1) - model_results = model_results.set_precision(round) - - progress.value += 1 - - #refitting the model on complete X_train, y_train - monitor.iloc[1,1:] = 'Finalizing Model' - monitor.iloc[2,1:] = 'Almost Finished' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - model_fit_start = time.time() - logger.info("Finalizing model") - model.fit(data_X, data_y) - model_fit_end = time.time() - - model_fit_time = np.array(model_fit_end - model_fit_start).round(2) - - #storing results in create_model_container - logger.info("Uploading results into container") - create_model_container.append(model_results.data) - display_container.append(model_results.data) - - #storing results in master_model_container - logger.info("Uploading model into container") - master_model_container.append(model) - - progress.value += 1 - - ''' - When choose_better sets to True. optimize metric in scoregrid is - compared with base model created using create_model so that ensemble_model - functions return the model with better score only. This will ensure - model performance is atleast equivalent to what is seen is compare_models - ''' - if choose_better: - - logger.info("choose_better activated") - - if verbose: - if html_param: - monitor.iloc[1,1:] = 'Compiling Final Results' - monitor.iloc[2,1:] = 'Almost Finished' - update_display(monitor, display_id = 'monitor') - - #creating base model for comparison - logger.info("SubProcess create_model() called ==================================") - base_model = create_model(estimator=estimator, verbose = False, system=False) - logger.info("SubProcess create_model() end ==================================") - base_model_results = create_model_container[-1][compare_dimension][-2:][0] - ensembled_model_results = create_model_container[-2][compare_dimension][-2:][0] - - if ensembled_model_results > base_model_results: - model = model - else: - model = base_model - - #re-instate display_constainer state - display_container.pop(-1) - logger.info("choose_better completed") - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - if logging_param: - - logger.info("Creating MLFlow logs") - - #Creating Logs message monitor - monitor.iloc[1,1:] = 'Creating Logs' - monitor.iloc[2,1:] = 'Almost Finished' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - - import mlflow - from pathlib import Path - import os - - mlflow.set_experiment(exp_name_log) - full_name = model_dict_logging.get(mn) - - with mlflow.start_run(run_name=full_name) as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - params = model.get_params() - - for i in list(params): - v = params.get(i) - if len(str(v)) > 250: - params.pop(i) - - mlflow.log_params(params) - mlflow.log_metrics({"Accuracy": 
avgs_acc[0], "AUC": avgs_auc[0], "Recall": avgs_recall[0], "Precision" : avgs_precision[0], - "F1": avgs_f1[0], "Kappa": avgs_kappa[0], "MCC": avgs_mcc[0]}) - - #set tag of compare_models - mlflow.set_tag("Source", "ensemble_model") - - import secrets - URI = secrets.token_hex(nbytes=4) - mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - # Log training time in seconds - mlflow.log_metric("TT", model_fit_time) - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(model, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - # Generate hold-out predictions and save as html - holdout = predict_model(model, verbose=False) - holdout_score = pull() - display_container.pop(-1) - holdout_score.to_html('Holdout.html', col_space=65, justify='left') - mlflow.log_artifact('Holdout.html') - os.remove('Holdout.html') - - # Log AUC and Confusion Matrix plot - if log_plots_param: - - logger.info("SubProcess plot_model() called ==================================") - - try: - plot_model(model, plot = 'auc', verbose=False, save=True, system=False) - mlflow.log_artifact('AUC.png') - os.remove("AUC.png") - except: - pass - - try: - plot_model(model, plot = 'confusion_matrix', verbose=False, save=True, system=False) - mlflow.log_artifact('Confusion Matrix.png') - os.remove("Confusion Matrix.png") - except: - pass - - try: - plot_model(model, plot = 'feature', verbose=False, save=True, system=False) - mlflow.log_artifact('Feature Importance.png') - os.remove("Feature Importance.png") - except: - pass - - logger.info("SubProcess plot_model() end ==================================") - - # Log the CV results as model_results.html artifact - model_results.data.to_html('Results.html', col_space=65, justify='left') - mlflow.log_artifact('Results.html') - os.remove('Results.html') - - if verbose: - clear_output() - if html_param: - display(model_results) - else: - print(model_results.data) - else: - clear_output() - - logger.info("create_model_container: " + str(len(create_model_container))) - logger.info("master_model_container: " + str(len(master_model_container))) - logger.info("display_container: " + str(len(display_container))) - - logger.info(str(model)) - logger.info("ensemble_model() succesfully completed......................................") - - return model - -def plot_model(estimator, - plot = 'auc', - save = False, #added in pycaret 2.0.0 - verbose = True, #added in pycaret 2.0.0 - system = True): #added in pycaret 2.0.0 - - - """ - - Description: - ------------ - This function takes a trained model object and returns a plot based on the - test / hold-out set. The process may require the model to be re-trained in - certain cases. See list of plots supported below. - - Model must be created using create_model() or tune_model(). - - Example: - -------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - lr = create_model('lr') - - plot_model(lr) - - This will return an AUC plot of a trained Logistic Regression model. 
- - Parameters - ---------- - estimator : object, default = none - A trained model object should be passed as an estimator. - - plot : string, default = auc - Enter abbreviation of type of plot. The current list of plots supported are: - - Plot Name - ------------------ ----------------------- - 'auc' Area Under the Curve - 'threshold' Discrimination Threshold - 'pr' Precision Recall Curve - 'confusion_matrix' Confusion Matrix - 'error' Class Prediction Error - 'class_report' Classification Report - 'boundary' Decision Boundary - 'rfe' Recursive Feature Selection - 'learning' Learning Curve - 'manifold' Manifold Learning - 'calibration' Calibration Curve - 'vc' Validation Curve - 'dimension' Dimension Learning - 'feature' Feature Importance - 'parameter' Model Hyperparameter - - save: Boolean, default = False - When set to True, Plot is saved as a 'png' file in current working directory. - - verbose: Boolean, default = True - Progress bar not shown when verbose set to False. - - system: Boolean, default = True - Must remain True all times. Only to be changed by internal functions. - - Returns: - -------- - - Visual Plot: Prints the visual plot. - ------------ - - Warnings: - --------- - - 'svm' and 'ridge' doesn't support the predict_proba method. As such, AUC and - calibration plots are not available for these estimators. - - - When the 'max_features' parameter of a trained model object is not equal to - the number of samples in training set, the 'rfe' plot is not available. - - - 'calibration', 'threshold', 'manifold' and 'rfe' plots are not available for - multiclass problems. - - - """ - - - ''' - - ERROR HANDLING STARTS HERE - - ''' - - #exception checking - import sys - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing plot_model()") - logger.info("""plot_model(estimator={}, plot={}, save={}, verbose={}, system={})""".\ - format(str(estimator), str(plot), str(save), str(verbose), str(system))) - - logger.info("Checking exceptions") - - #checking plots (string) - available_plots = ['auc', 'threshold', 'pr', 'confusion_matrix', 'error', 'class_report', 'boundary', 'rfe', 'learning', - 'manifold', 'calibration', 'vc', 'dimension', 'feature', 'parameter'] - - if plot not in available_plots: - sys.exit('(Value Error): Plot Not Available. Please see docstring for list of available Plots.') - - #multiclass plot exceptions: - multiclass_not_available = ['calibration', 'threshold', 'manifold', 'rfe'] - if y.value_counts().count() > 2: - if plot in multiclass_not_available: - sys.exit('(Value Error): Plot Not Available for multiclass problems. 
Please see docstring for list of available Plots.')
-
-    #exception for CatBoost
-    if 'CatBoostClassifier' in str(type(estimator)):
-        sys.exit('(Estimator Error): CatBoost estimator is not compatible with the plot_model function. Try using CatBoost with interpret_model instead.')
-
-    #checking for auc plot
-    if not hasattr(estimator, 'predict_proba') and plot == 'auc':
-        sys.exit('(Type Error): AUC plot not available for estimators with no predict_proba attribute.')
-
-    #checking for calibration plot
-    if not hasattr(estimator, 'predict_proba') and plot == 'calibration':
-        sys.exit('(Type Error): Calibration plot not available for estimators with no predict_proba attribute.')
-
-    #checking for rfe
-    if hasattr(estimator, 'max_features') and plot == 'rfe' and estimator.max_features_ != X_train.shape[1]:
-        sys.exit('(Type Error): RFE plot not available when max_features parameter is not set to None.')
-
-    #checking for feature plot
-    if not ( hasattr(estimator, 'coef_') or hasattr(estimator, 'feature_importances_') ) and plot == 'feature':
-        sys.exit('(Type Error): Feature Importance plot not available for estimators that do not support the coef_ or feature_importances_ attribute.')
-
-    '''
-
-    ERROR HANDLING ENDS HERE
-
-    '''
-
-    logger.info("Preloading libraries")
-    #pre-load libraries
-    import pandas as pd
-    import ipywidgets as ipw
-    from IPython.display import display, HTML, clear_output, update_display
-
-    logger.info("Preparing display monitor")
-    #progress bar
-    progress = ipw.IntProgress(value=0, min=0, max=5, step=1, description='Processing: ')
-    if verbose:
-        if html_param:
-            display(progress)
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    logger.info("Importing libraries")
-    #general dependencies
-    import matplotlib.pyplot as plt
-    import numpy as np
-    import pandas as pd
-
-    progress.value += 1
-
-    #defining estimator as model locally
-    model = estimator
-
-    progress.value += 1
-
-    #plots used for logging (controlled through plots_log_param)
-    #AUC, #Confusion Matrix and #Feature Importance
-
-    logger.info("plot type: " + str(plot))
-
-    if plot == 'auc':
-
-        from yellowbrick.classifier import ROCAUC
-        progress.value += 1
-        visualizer = ROCAUC(model)
-        logger.info("Fitting Model")
-        visualizer.fit(X_train, y_train)
-        progress.value += 1
-        logger.info("Scoring test/hold-out set")
-        visualizer.score(X_test, y_test)
-        progress.value += 1
-        clear_output()
-        if save:
-            logger.info("Saving 'AUC.png' in current active directory")
-            if system:
-                visualizer.show(outpath="AUC.png")
-            else:
-                visualizer.show(outpath="AUC.png", clear_figure=True)
-        else:
-            visualizer.show()
-
-        logger.info("Visual Rendered Successfully")
-
-    elif plot == 'threshold':
-
-        from yellowbrick.classifier import DiscriminationThreshold
-        progress.value += 1
-        visualizer = DiscriminationThreshold(model, random_state=seed)
-        logger.info("Fitting Model")
-        visualizer.fit(X_train, y_train)
-        progress.value += 1
-        logger.info("Scoring test/hold-out set")
-        visualizer.score(X_test, y_test)
-        progress.value += 1
-        clear_output()
-        if save:
-            logger.info("Saving 'Threshold Curve.png' in current active directory")
-            if system:
-                visualizer.show(outpath="Threshold Curve.png")
-            else:
-                visualizer.show(outpath="Threshold Curve.png", clear_figure=True)
-        else:
-            visualizer.show()
-
-        logger.info("Visual Rendered 
Successfully") - - elif plot == 'pr': - - from yellowbrick.classifier import PrecisionRecallCurve - progress.value += 1 - visualizer = PrecisionRecallCurve(model, random_state=seed) - logger.info("Fitting Model") - visualizer.fit(X_train, y_train) - progress.value += 1 - logger.info("Scoring test/hold-out set") - visualizer.score(X_test, y_test) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Precision Recall.png' in current active directory") - if system: - visualizer.show(outpath="Precision Recall.png") - else: - visualizer.show(outpath="Precision Recall.png", clear_figure=True) - else: - visualizer.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'confusion_matrix': - - from yellowbrick.classifier import ConfusionMatrix - progress.value += 1 - visualizer = ConfusionMatrix(model, random_state=seed, fontsize = 15, cmap="Greens") - logger.info("Fitting Model") - visualizer.fit(X_train, y_train) - progress.value += 1 - logger.info("Scoring test/hold-out set") - visualizer.score(X_test, y_test) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Confusion Matrix.png' in current active directory") - if system: - visualizer.show(outpath="Confusion Matrix.png") - else: - visualizer.show(outpath="Confusion Matrix.png", clear_figure=True) - else: - visualizer.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'error': - - from yellowbrick.classifier import ClassPredictionError - progress.value += 1 - visualizer = ClassPredictionError(model, random_state=seed) - logger.info("Fitting Model") - visualizer.fit(X_train, y_train) - progress.value += 1 - logger.info("Scoring test/hold-out set") - visualizer.score(X_test, y_test) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Class Prediction Error.png' in current active directory") - if system: - visualizer.show(outpath="Class Prediction Error.png") - else: - visualizer.show(outpath="Class Prediction Error.png", clear_figure=True) - else: - visualizer.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'class_report': - - from yellowbrick.classifier import ClassificationReport - progress.value += 1 - visualizer = ClassificationReport(model, random_state=seed, support=True) - logger.info("Fitting Model") - visualizer.fit(X_train, y_train) - progress.value += 1 - logger.info("Scoring test/hold-out set") - visualizer.score(X_test, y_test) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Classification Report.png' in current active directory") - if system: - visualizer.show(outpath="Classification Report.png") - else: - visualizer.show(outpath="Classification Report.png", clear_figure=True) - else: - visualizer.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'boundary': - - from sklearn.preprocessing import StandardScaler - from sklearn.decomposition import PCA - from yellowbrick.contrib.classifier import DecisionViz - from copy import deepcopy - model2 = deepcopy(estimator) - - progress.value += 1 - - X_train_transformed = X_train.copy() - X_test_transformed = X_test.copy() - X_train_transformed = X_train_transformed.select_dtypes(include='float64') - X_test_transformed = X_test_transformed.select_dtypes(include='float64') - logger.info("Fitting StandardScaler()") - X_train_transformed = StandardScaler().fit_transform(X_train_transformed) - X_test_transformed = StandardScaler().fit_transform(X_test_transformed) - pca = PCA(n_components=2, random_state = seed) - 
logger.info("Fitting PCA()") - X_train_transformed = pca.fit_transform(X_train_transformed) - X_test_transformed = pca.fit_transform(X_test_transformed) - - progress.value += 1 - - y_train_transformed = y_train.copy() - y_test_transformed = y_test.copy() - y_train_transformed = np.array(y_train_transformed) - y_test_transformed = np.array(y_test_transformed) - - viz_ = DecisionViz(model2) - logger.info("Fitting Model") - viz_.fit(X_train_transformed, y_train_transformed, features=['Feature One', 'Feature Two'], classes=['A', 'B']) - viz_.draw(X_test_transformed, y_test_transformed) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Decision Boundary.png' in current active directory") - if system: - viz_.show(outpath="Decision Boundary.png") - else: - viz_.show(outpath="Decision Boundary.png", clear_figure=True) - else: - viz_.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'rfe': - - from yellowbrick.model_selection import RFECV - progress.value += 1 - visualizer = RFECV(model, cv=10) - progress.value += 1 - logger.info("Fitting Model") - visualizer.fit(X_train, y_train) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Recursive Feature Selection.png' in current active directory") - if system: - visualizer.show(outpath="Recursive Feature Selection.png") - else: - visualizer.show(outpath="Recursive Feature Selection.png", clear_figure=True) - else: - visualizer.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'learning': - - from yellowbrick.model_selection import LearningCurve - progress.value += 1 - sizes = np.linspace(0.3, 1.0, 10) - visualizer = LearningCurve(model, cv=10, train_sizes=sizes, n_jobs=n_jobs_param, random_state=seed) - progress.value += 1 - logger.info("Fitting Model") - visualizer.fit(X_train, y_train) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Learning Curve.png' in current active directory") - if system: - visualizer.show(outpath="Learning Curve.png") - else: - visualizer.show(outpath="Learning Curve.png", clear_figure=True) - else: - visualizer.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'manifold': - - from yellowbrick.features import Manifold - - progress.value += 1 - X_train_transformed = X_train.select_dtypes(include='float64') - visualizer = Manifold(manifold='tsne', random_state = seed) - progress.value += 1 - logger.info("Fitting Model") - visualizer.fit_transform(X_train_transformed, y_train) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Manifold Plot.png' in current active directory") - if system: - visualizer.show(outpath="Manifold Plot.png") - else: - visualizer.show(outpath="Manifold Plot.png", clear_figure=True) - else: - visualizer.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'calibration': - - from sklearn.calibration import calibration_curve - - model_name = str(model).split("(")[0] - - plt.figure(figsize=(7, 6)) - ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2) - - ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated") - progress.value += 1 - logger.info("Scoring test/hold-out set") - prob_pos = model.predict_proba(X_test)[:, 1] - prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min()) - fraction_of_positives, mean_predicted_value = calibration_curve(y_test, prob_pos, n_bins=10) - progress.value += 1 - ax1.plot(mean_predicted_value, fraction_of_positives, "s-",label="%s" % (model_name, )) - - ax1.set_ylabel("Fraction of 
positives") - ax1.set_ylim([0, 1]) - ax1.set_xlim([0, 1]) - ax1.legend(loc="lower right") - ax1.set_title('Calibration plots (reliability curve)') - ax1.set_facecolor('white') - ax1.grid(b=True, color='grey', linewidth=0.5, linestyle = '-') - plt.tight_layout() - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Calibration Plot.png' in current active directory") - if system: - plt.savefig("Calibration Plot.png") - else: - plt.show() - else: - plt.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'vc': - - model_name = str(model).split("(")[0] - - logger.info("Determining param_name") - - #SGD Classifier - if model_name == 'SGDClassifier': - param_name='l1_ratio' - param_range = np.arange(0,1, 0.01) - - elif model_name == 'LinearDiscriminantAnalysis': - sys.exit('(Value Error): Shrinkage Parameter not supported in Validation Curve Plot.') - - #tree based models - elif hasattr(model, 'max_depth'): - param_name='max_depth' - param_range = np.arange(1,11) - - #knn - elif hasattr(model, 'n_neighbors'): - param_name='n_neighbors' - param_range = np.arange(1,11) - - #MLP / Ridge - elif hasattr(model, 'alpha'): - param_name='alpha' - param_range = np.arange(0,1,0.1) - - #Logistic Regression - elif hasattr(model, 'C'): - param_name='C' - param_range = np.arange(1,11) - - #Bagging / Boosting - elif hasattr(model, 'n_estimators'): - param_name='n_estimators' - param_range = np.arange(1,100,10) - - #Bagging / Boosting / gbc / ada / - elif hasattr(model, 'n_estimators'): - param_name='n_estimators' - param_range = np.arange(1,100,10) - - #Naive Bayes - elif hasattr(model, 'var_smoothing'): - param_name='var_smoothing' - param_range = np.arange(0.1, 1, 0.01) - - #QDA - elif hasattr(model, 'reg_param'): - param_name='reg_param' - param_range = np.arange(0,1,0.1) - - #GPC - elif hasattr(model, 'max_iter_predict'): - param_name='max_iter_predict' - param_range = np.arange(100,1000,100) - - else: - clear_output() - sys.exit('(Type Error): Plot not supported for this estimator. 
Try different estimator.') - - logger.info("param_name: " + str(param_name)) - - progress.value += 1 - - from yellowbrick.model_selection import ValidationCurve - viz = ValidationCurve(model, param_name=param_name, param_range=param_range,cv=10, - random_state=seed) - logger.info("Fitting Model") - viz.fit(X_train, y_train) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Validation Curve.png' in current active directory") - if system: - viz.show(outpath="Validation Curve.png") - else: - viz.show(outpath="Validation Curve.png", clear_figure=True) - else: - viz.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'dimension': - - from yellowbrick.features import RadViz - from sklearn.preprocessing import StandardScaler - from sklearn.decomposition import PCA - progress.value += 1 - X_train_transformed = X_train.select_dtypes(include='float64') - logger.info("Fitting StandardScaler()") - X_train_transformed = StandardScaler().fit_transform(X_train_transformed) - y_train_transformed = np.array(y_train) - - features=min(round(len(X_train.columns) * 0.3,0),5) - features = int(features) - - pca = PCA(n_components=features, random_state=seed) - logger.info("Fitting PCA()") - X_train_transformed = pca.fit_transform(X_train_transformed) - progress.value += 1 - classes = y_train.unique().tolist() - visualizer = RadViz(classes=classes, alpha=0.25) - logger.info("Fitting Model") - visualizer.fit(X_train_transformed, y_train_transformed) - visualizer.transform(X_train_transformed) - progress.value += 1 - clear_output() - if save: - logger.info("Saving 'Dimension Plot.png' in current active directory") - if system: - visualizer.show(outpath="Dimension Plot.png") - else: - visualizer.show(outpath="Dimension Plot.png", clear_figure=True) - else: - visualizer.show() - - logger.info("Visual Rendered Successfully") - - elif plot == 'feature': - - if hasattr(estimator,'coef_'): - variables = abs(model.coef_[0]) - else: - logger.warning("No coef_ found. 
Trying feature_importances_")
-            variables = abs(model.feature_importances_)
-        col_names = np.array(X_train.columns)
-        coef_df = pd.DataFrame({'Variable': X_train.columns, 'Value': variables})
-        #keep the 10 largest values, ordered ascending for plotting
-        sorted_df = coef_df.sort_values(by='Value', ascending=False)
-        sorted_df = sorted_df.head(10)
-        sorted_df = sorted_df.sort_values(by='Value')
-        my_range = range(1, len(sorted_df.index) + 1)
-        progress.value += 1
-        plt.figure(figsize=(8,5))
-        plt.hlines(y=my_range, xmin=0, xmax=sorted_df['Value'], color='skyblue')
-        plt.plot(sorted_df['Value'], my_range, "o")
-        progress.value += 1
-        plt.yticks(my_range, sorted_df['Variable'])
-        plt.title("Feature Importance Plot")
-        plt.xlabel('Variable Importance')
-        plt.ylabel('Features')
-        progress.value += 1
-        clear_output()
-        if save:
-            logger.info("Saving 'Feature Importance.png' in current active directory")
-            if system:
-                plt.savefig("Feature Importance.png")
-            else:
-                plt.savefig("Feature Importance.png")
-                plt.close()
-        else:
-            plt.show()
-
-        logger.info("Visual Rendered Successfully")
-
-    elif plot == 'parameter':
-
-        clear_output()
-        param_df = pd.DataFrame.from_dict(estimator.get_params(), orient='index', columns=['Parameters'])
-        display(param_df)
-        logger.info("Visual Rendered Successfully")
-
-    logger.info("plot_model() successfully completed......................................")
-
-def compare_models(blacklist = None,
-                   whitelist = None, #added in pycaret==2.0.0
-                   fold = 10,
-                   round = 4,
-                   sort = 'Accuracy',
-                   n_select = 1, #added in pycaret==2.0.0
-                   turbo = True,
-                   verbose = True): #added in pycaret==2.0.0
-
-    """
-
-    Description:
-    ------------
-    This function trains all the models available in the model library and scores them
-    using Stratified Cross Validation. The output prints a score grid with Accuracy,
-    AUC, Recall, Precision, F1, Kappa and MCC (averaged across folds), with the number
-    of folds determined by the fold parameter.
-
-    This function returns the best model based on the metric defined in the sort parameter.
-
-    To select the top N models, use the n_select parameter, which is set to 1 by default.
-    When n_select > 1, a list of trained model objects is returned.
-
-    When turbo is set to True, 'rbfsvm', 'gpc' and 'mlp' are excluded due to their longer
-    training time. By default the turbo param is set to True.
-
-    Example:
-    --------
-    from pycaret.datasets import get_data
-    juice = get_data('juice')
-    experiment_name = setup(data = juice, target = 'Purchase')
-
-    best_model = compare_models()
-
-    This will return the averaged score grid of all the models except 'rbfsvm', 'gpc'
-    and 'mlp'. When the turbo param is set to False, all models including 'rbfsvm',
-    'gpc' and 'mlp' are used, but this may result in longer training times.
-
-    best_model = compare_models( blacklist = [ 'knn', 'gbc' ] , turbo = False)
-
-    This will return a comparison of all models except K Neighbors Classifier and
-    Gradient Boosting Classifier.
-
-    best_model = compare_models( blacklist = [ 'knn', 'gbc' ] , turbo = True)
-
-    This will return a comparison of all models except K Neighbors Classifier,
-    Gradient Boosting Classifier, SVM (RBF), Gaussian Process Classifier and
-    Multi Layer Perceptron.
-
-
-    Parameters
-    ----------
-    blacklist: list of strings, default = None
-    In order to omit certain models from the comparison, model IDs can be passed as
-    a list of strings in the blacklist param.
- - whitelist: list of strings, default = None - In order to run only certain models for the comparison, the model ID's can be - passed as a list of strings in whitelist param. - - fold: integer, default = 10 - Number of folds to be used in Kfold CV. Must be at least 2. - - round: integer, default = 4 - Number of decimal places the metrics in the score grid will be rounded to. - - sort: string, default = 'Accuracy' - The scoring measure specified is used for sorting the average score grid - Other options are 'AUC', 'Recall', 'Precision', 'F1', 'Kappa' and 'MCC'. - - n_select: int, default = 1 - Number of top_n models to return. use negative argument for bottom selection. - for example, n_select = -3 means bottom 3 models. - - turbo: Boolean, default = True - When turbo is set to True, it blacklists estimators that have longer - training time. - - verbose: Boolean, default = True - Score grid is not printed when verbose is set to False. - - Returns: - -------- - - score grid: A table containing the scores of the model across the kfolds. - ----------- Scoring metrics used are Accuracy, AUC, Recall, Precision, F1, - Kappa and MCC. Mean and standard deviation of the scores across - the folds are also returned. - - Warnings: - --------- - - compare_models() though attractive, might be time consuming with large - datasets. By default turbo is set to True, which blacklists models that - have longer training times. Changing turbo parameter to False may result - in very high training times with datasets where number of samples exceed - 10,000. - - - If target variable is multiclass (more than 2 classes), AUC will be - returned as zero (0.0) - - - """ - - ''' - - ERROR HANDLING STARTS HERE - - ''' - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing compare_models()") - logger.info("""compare_models(blacklist={}, whitelist={}, fold={}, round={}, sort={}, n_select={}, turbo={}, verbose={})""".\ - format(str(blacklist), str(whitelist), str(fold), str(round), str(sort), str(n_select), str(turbo), str(verbose))) - - logger.info("Checking exceptions") - - #exception checking - import sys - - #checking error for blacklist (string) - available_estimators = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', - 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost'] - - if blacklist != None: - for i in blacklist: - if i not in available_estimators: - sys.exit('(Value Error): Estimator Not Available. Please see docstring for list of available estimators.') - - if whitelist != None: - for i in whitelist: - if i not in available_estimators: - sys.exit('(Value Error): Estimator Not Available. 
Please see docstring for list of available estimators.') - - #whitelist and blacklist together check - if whitelist is not None: - if blacklist is not None: - sys.exit('(Type Error): Cannot use blacklist parameter when whitelist is used to compare models.') - - #checking fold parameter - if type(fold) is not int: - sys.exit('(Type Error): Fold parameter only accepts integer value.') - - #checking round parameter - if type(round) is not int: - sys.exit('(Type Error): Round parameter only accepts integer value.') - - #checking sort parameter - allowed_sort = ['Accuracy', 'Recall', 'Precision', 'F1', 'AUC', 'Kappa', 'MCC', 'TT (Sec)'] - if sort not in allowed_sort: - sys.exit('(Value Error): Sort method not supported. See docstring for list of available parameters.') - - #checking optimize parameter for multiclass - if y.value_counts().count() > 2: - if sort == 'AUC': - sys.exit('(Type Error): AUC metric not supported for multiclass problems. See docstring for list of other optimization parameters.') - - ''' - - ERROR HANDLING ENDS HERE - - ''' - - logger.info("Preloading libraries") - - #pre-load libraries - import pandas as pd - import time, datetime - import ipywidgets as ipw - from IPython.display import display, HTML, clear_output, update_display - - pd.set_option('display.max_columns', 500) - - logger.info("Preparing display monitor") - - #progress bar - if blacklist is None: - len_of_blacklist = 0 - else: - len_of_blacklist = len(blacklist) - - if turbo: - len_mod = 15 - len_of_blacklist - else: - len_mod = 18 - len_of_blacklist - - #n_select param - if type(n_select) is list: - n_select_num = len(n_select) - else: - n_select_num = abs(n_select) - - if n_select_num > len_mod: - n_select_num = len_mod - - if whitelist is not None: - wl = len(whitelist) - bl = len_of_blacklist - len_mod = wl - bl - - if whitelist is not None: - opt = 10 - else: - opt = 25 - - progress = ipw.IntProgress(value=0, min=0, max=(fold*len_mod)+opt+n_select_num, step=1 , description='Processing: ') - master_display = pd.DataFrame(columns=['Model', 'Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa', 'MCC', 'TT (Sec)']) - if verbose: - if html_param: - display(progress) - - #display monitor - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ], - ['Estimator' , '. . . . . . . . . . . . . . . . . .' , 'Compiling Library' ], - ['ETC' , '. . . . . . . . . . . . . . . . . 
.', 'Calculating ETC'] ],
-                             columns=['', ' ', ' ']).set_index('')
-
-    if verbose:
-        if html_param:
-            display(monitor, display_id = 'monitor')
-            display_ = display(master_display, display_id=True)
-            display_id = display_.display_id
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    #general dependencies
-    import numpy as np
-    import random
-    from sklearn import metrics
-    from sklearn.model_selection import StratifiedKFold
-    import pandas.io.formats.style
-
-    logger.info("Copying training dataset")
-    #defining X_train and y_train as data_X and data_y
-    data_X = X_train
-    data_y = y_train
-
-    progress.value += 1
-
-    logger.info("Importing libraries")
-
-    #import sklearn dependencies
-    from sklearn.linear_model import LogisticRegression
-    from sklearn.neighbors import KNeighborsClassifier
-    from sklearn.naive_bayes import GaussianNB
-    from sklearn.tree import DecisionTreeClassifier
-    from sklearn.linear_model import SGDClassifier
-    from sklearn.svm import SVC
-    from sklearn.gaussian_process import GaussianProcessClassifier
-    from sklearn.neural_network import MLPClassifier
-    from sklearn.linear_model import RidgeClassifier
-    from sklearn.ensemble import RandomForestClassifier
-    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
-    from sklearn.ensemble import AdaBoostClassifier
-    from sklearn.ensemble import GradientBoostingClassifier
-    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-    from sklearn.ensemble import ExtraTreesClassifier
-    from sklearn.multiclass import OneVsRestClassifier
-    from xgboost import XGBClassifier
-    from catboost import CatBoostClassifier
-    try:
-        import lightgbm as lgb
-    except:
-        logger.info("LightGBM import failed")
-
-    progress.value += 1
-
-    #defining sort parameter (making Precision equivalent to Prec.)
-    if sort == 'Precision':
-        sort = 'Prec.'
-
-    '''
-    MONITOR UPDATE STARTS
-    '''
-
-    monitor.iloc[1,1:] = 'Loading Estimator'
-    if verbose:
-        if html_param:
-            update_display(monitor, display_id = 'monitor')
-
-    '''
-    MONITOR UPDATE ENDS
-    '''
-
-    logger.info("Importing untrained models")
-
-    #creating model object
-    lr = LogisticRegression(random_state=seed) #don't add n_jobs_param here; it slows down Logistic Regression somehow
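    Before the long instantiation block that continues below, it may help to see
    what compare_models reduces to for each estimator: a stratified K-fold loop
    that fits on each training split and scores the matching validation split.
    The following is an editorial sketch, not the library's code; it uses only the
    lr object just defined and the data_X, data_y, fold, seed and
    folds_shuffle_param names already in scope, and the real loop additionally
    records AUC, Recall, Precision, F1, Kappa, MCC and training time:

    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold

    kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param)
    fold_acc = []
    for train_i, test_i in kf.split(data_X, data_y):
        lr.fit(data_X.iloc[train_i], data_y.iloc[train_i])
        pred = lr.predict(data_X.iloc[test_i])
        fold_acc.append(metrics.accuracy_score(data_y.iloc[test_i], pred))
    #the mean and std of fold_acc become one row of the score grid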
- knn = KNeighborsClassifier(n_jobs=n_jobs_param) - nb = GaussianNB() - dt = DecisionTreeClassifier(random_state=seed) - svm = SGDClassifier(max_iter=1000, tol=0.001, random_state=seed, n_jobs=n_jobs_param) - rbfsvm = SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed) - gpc = GaussianProcessClassifier(random_state=seed, n_jobs=n_jobs_param) - mlp = MLPClassifier(max_iter=500, random_state=seed) - ridge = RidgeClassifier(random_state=seed) - rf = RandomForestClassifier(n_estimators=10, random_state=seed, n_jobs=n_jobs_param) - qda = QuadraticDiscriminantAnalysis() - ada = AdaBoostClassifier(random_state=seed) - gbc = GradientBoostingClassifier(random_state=seed) - lda = LinearDiscriminantAnalysis() - et = ExtraTreesClassifier(random_state=seed, n_jobs=n_jobs_param) - xgboost = XGBClassifier(random_state=seed, verbosity=0, n_jobs=n_jobs_param) - lightgbm = lgb.LGBMClassifier(random_state=seed, n_jobs=n_jobs_param) - catboost = CatBoostClassifier(random_state=seed, silent = True, thread_count=n_jobs_param) - - logger.info("Import successful") - - progress.value += 1 - - model_dict = {'Logistic Regression' : 'lr', - 'Linear Discriminant Analysis' : 'lda', - 'Ridge Classifier' : 'ridge', - 'Extreme Gradient Boosting' : 'xgboost', - 'Ada Boost Classifier' : 'ada', - 'CatBoost Classifier' : 'catboost', - 'Light Gradient Boosting Machine' : 'lightgbm', - 'Gradient Boosting Classifier' : 'gbc', - 'Random Forest Classifier' : 'rf', - 'Naive Bayes' : 'nb', - 'Extra Trees Classifier' : 'et', - 'Decision Tree Classifier' : 'dt', - 'K Neighbors Classifier' : 'knn', - 'Quadratic Discriminant Analysis' : 'qda', - 'SVM - Linear Kernel' : 'svm', - 'Gaussian Process Classifier' : 'gpc', - 'MLP Classifier' : 'mlp', - 'SVM - Radial Kernel' : 'rbfsvm'} - - model_library = [lr, knn, nb, dt, svm, rbfsvm, gpc, mlp, ridge, rf, qda, ada, gbc, lda, et, xgboost, lightgbm, catboost] - - model_names = ['Logistic Regression', - 'K Neighbors Classifier', - 'Naive Bayes', - 'Decision Tree Classifier', - 'SVM - Linear Kernel', - 'SVM - Radial Kernel', - 'Gaussian Process Classifier', - 'MLP Classifier', - 'Ridge Classifier', - 'Random Forest Classifier', - 'Quadratic Discriminant Analysis', - 'Ada Boost Classifier', - 'Gradient Boosting Classifier', - 'Linear Discriminant Analysis', - 'Extra Trees Classifier', - 'Extreme Gradient Boosting', - 'Light Gradient Boosting Machine', - 'CatBoost Classifier'] - - #checking for blacklist models - - model_library_str = ['lr', 'knn', 'nb', 'dt', 'svm', - 'rbfsvm', 'gpc', 'mlp', 'ridge', - 'rf', 'qda', 'ada', 'gbc', 'lda', - 'et', 'xgboost', 'lightgbm', 'catboost'] - - model_library_str_ = ['lr', 'knn', 'nb', 'dt', 'svm', - 'rbfsvm', 'gpc', 'mlp', 'ridge', - 'rf', 'qda', 'ada', 'gbc', 'lda', - 'et', 'xgboost', 'lightgbm', 'catboost'] - - if blacklist is not None: - - if turbo: - internal_blacklist = ['rbfsvm', 'gpc', 'mlp'] - compiled_blacklist = blacklist + internal_blacklist - blacklist = list(set(compiled_blacklist)) - - else: - blacklist = blacklist - - for i in blacklist: - model_library_str_.remove(i) - - si = [] - - for i in model_library_str_: - s = model_library_str.index(i) - si.append(s) - - model_library_ = [] - model_names_= [] - for i in si: - model_library_.append(model_library[i]) - model_names_.append(model_names[i]) - - model_library = model_library_ - model_names = model_names_ - - - if blacklist is None and turbo is True: - - model_library = [lr, knn, nb, dt, svm, ridge, rf, qda, ada, gbc, lda, et, xgboost, lightgbm, catboost] - - model_names 
= ['Logistic Regression', - 'K Neighbors Classifier', - 'Naive Bayes', - 'Decision Tree Classifier', - 'SVM - Linear Kernel', - 'Ridge Classifier', - 'Random Forest Classifier', - 'Quadratic Discriminant Analysis', - 'Ada Boost Classifier', - 'Gradient Boosting Classifier', - 'Linear Discriminant Analysis', - 'Extra Trees Classifier', - 'Extreme Gradient Boosting', - 'Light Gradient Boosting Machine', - 'CatBoost Classifier'] - - #checking for whitelist models - if whitelist is not None: - - model_library = [] - model_names = [] - - for i in whitelist: - if i == 'lr': - model_library.append(lr) - model_names.append('Logistic Regression') - elif i == 'knn': - model_library.append(knn) - model_names.append('K Neighbors Classifier') - elif i == 'nb': - model_library.append(nb) - model_names.append('Naive Bayes') - elif i == 'dt': - model_library.append(dt) - model_names.append('Decision Tree Classifier') - elif i == 'svm': - model_library.append(svm) - model_names.append('SVM - Linear Kernel') - elif i == 'rbfsvm': - model_library.append(rbfsvm) - model_names.append('SVM - Radial Kernel') - elif i == 'gpc': - model_library.append(gpc) - model_names.append('Gaussian Process Classifier') - elif i == 'mlp': - model_library.append(mlp) - model_names.append('MLP Classifier') - elif i == 'ridge': - model_library.append(ridge) - model_names.append('Ridge Classifier') - elif i == 'rf': - model_library.append(rf) - model_names.append('Random Forest Classifier') - elif i == 'qda': - model_library.append(qda) - model_names.append('Quadratic Discriminant Analysis') - elif i == 'ada': - model_library.append(ada) - model_names.append('Ada Boost Classifier') - elif i == 'gbc': - model_library.append(gbc) - model_names.append('Gradient Boosting Classifier') - elif i == 'lda': - model_library.append(lda) - model_names.append('Linear Discriminant Analysis') - elif i == 'et': - model_library.append(et) - model_names.append('Extra Trees Classifier') - elif i == 'xgboost': - model_library.append(xgboost) - model_names.append('Extreme Gradient Boosting') - elif i == 'lightgbm': - model_library.append(lightgbm) - model_names.append('Light Gradient Boosting Machine') - elif i == 'catboost': - model_library.append(catboost) - model_names.append('CatBoost Classifier') - - #multiclass check - model_library_multiclass = [] - if y.value_counts().count() > 2: - for i in model_library: - model = OneVsRestClassifier(i) - model_library_multiclass.append(model) - - model_library = model_library_multiclass - - progress.value += 1 - - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Initializing CV' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - #cross validation setup starts here - logger.info("Defining folds") - kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param) - - logger.info("Declaring metric variables") - score_acc =np.empty((0,0)) - score_auc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_acc_running = np.empty((0,0)) ##running total - score_mcc=np.empty((0,0)) - score_training_time=np.empty((0,0)) - avg_acc = np.empty((0,0)) - avg_auc = np.empty((0,0)) - avg_recall = np.empty((0,0)) - avg_precision = np.empty((0,0)) - avg_f1 = np.empty((0,0)) - avg_kappa = np.empty((0,0)) - avg_mcc=np.empty((0,0)) - avg_training_time=np.empty((0,0)) - - #create URI (before loop) - import secrets - URI = 
secrets.token_hex(nbytes=4) - - name_counter = 0 - - for model in model_library: - - logger.info("Initializing " + str(model_names[name_counter])) - - #run_time - runtime_start = time.time() - - progress.value += 1 - - ''' - MONITOR UPDATE STARTS - ''' - monitor.iloc[2,1:] = model_names[name_counter] - monitor.iloc[3,1:] = 'Calculating ETC' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - fold_num = 1 - - for train_i , test_i in kf.split(data_X,data_y): - - logger.info("Initializing Fold " + str(fold_num)) - - progress.value += 1 - - t0 = time.time() - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Fitting Fold ' + str(fold_num) + ' of ' + str(fold) - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i] - ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i] - - if fix_imbalance_param: - - logger.info("Initializing SMOTE") - - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state = seed) - else: - resampler = fix_imbalance_method_param - - Xtrain,ytrain = resampler.fit_sample(Xtrain, ytrain) - logger.info("Resampling completed") - - if hasattr(model, 'predict_proba'): - time_start=time.time() - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - time_end=time.time() - pred_prob = model.predict_proba(Xtest) - pred_prob = pred_prob[:,1] - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - logger.warning("model has no predict_proba attribute. AUC set to 0.00") - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - else: - time_start=time.time() - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - time_end=time.time() - logger.warning("model has no predict_proba attribute. pred_prob set to 0.00") - pred_prob = 0.00 - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - logger.warning("model has no predict_proba attribute. 
AUC set to 0.00") - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - logger.info("Compiling Metrics") - mcc = metrics.matthews_corrcoef(ytest,pred_) - kappa = metrics.cohen_kappa_score(ytest,pred_) - training_time= time_end - time_start - score_acc = np.append(score_acc,sca) - score_auc = np.append(score_auc,sc) - score_recall = np.append(score_recall,recall) - score_precision = np.append(score_precision,precision) - score_f1 =np.append(score_f1,f1) - score_kappa =np.append(score_kappa,kappa) - score_mcc=np.append(score_mcc,mcc) - score_training_time=np.append(score_training_time,training_time) - - ''' - TIME CALCULATION SUB-SECTION STARTS HERE - ''' - t1 = time.time() - - tt = (t1 - t0) * (fold-fold_num) / 60 - tt = np.around(tt, 2) - - if tt < 1: - tt = str(np.around((tt * 60), 2)) - ETC = tt + ' Seconds Remaining' - - else: - tt = str (tt) - ETC = tt + ' Minutes Remaining' - - fold_num += 1 - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[3,1:] = ETC - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - logger.info("Calculating mean and std") - avg_acc = np.append(avg_acc,np.mean(score_acc)) - avg_auc = np.append(avg_auc,np.mean(score_auc)) - avg_recall = np.append(avg_recall,np.mean(score_recall)) - avg_precision = np.append(avg_precision,np.mean(score_precision)) - avg_f1 = np.append(avg_f1,np.mean(score_f1)) - avg_kappa = np.append(avg_kappa,np.mean(score_kappa)) - avg_mcc=np.append(avg_mcc,np.mean(score_mcc)) - avg_training_time=np.append(avg_training_time,np.mean(score_training_time)) - - logger.info("Creating metrics dataframe") - compare_models_ = pd.DataFrame({'Model':model_names[name_counter], 'Accuracy':avg_acc, 'AUC':avg_auc, - 'Recall':avg_recall, 'Prec.':avg_precision, - 'F1':avg_f1, 'Kappa': avg_kappa, 'MCC':avg_mcc, 'TT (Sec)':avg_training_time}) - master_display = pd.concat([master_display, compare_models_],ignore_index=True) - master_display = master_display.round(round) - master_display = master_display.sort_values(by=sort,ascending=False) - master_display.reset_index(drop=True, inplace=True) - - if verbose: - if html_param: - update_display(master_display, display_id = display_id) - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - """ - MLflow logging starts here - """ - - if logging_param: - - logger.info("Creating MLFlow logs") - - import mlflow - from pathlib import Path - import os - - run_name = model_names[name_counter] - - with mlflow.start_run(run_name=run_name) as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - params = model.get_params() - - for i in list(params): - v = params.get(i) - if len(str(v)) > 250: - params.pop(i) - - mlflow.log_params(params) - - #set tag of compare_models - mlflow.set_tag("Source", "compare_models") - mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - #Log top model metrics - mlflow.log_metric("Accuracy", avg_acc[0]) - mlflow.log_metric("AUC", avg_auc[0]) - mlflow.log_metric("Recall", avg_recall[0]) - mlflow.log_metric("Precision", avg_precision[0]) - mlflow.log_metric("F1", avg_f1[0]) - mlflow.log_metric("Kappa", avg_kappa[0]) - mlflow.log_metric("MCC", avg_mcc[0]) - mlflow.log_metric("TT", avg_training_time[0]) - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called 
==================================") - save_model(model, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - score_acc =np.empty((0,0)) - score_auc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_mcc =np.empty((0,0)) - score_training_time =np.empty((0,0)) - - avg_acc = np.empty((0,0)) - avg_auc = np.empty((0,0)) - avg_recall = np.empty((0,0)) - avg_precision = np.empty((0,0)) - avg_f1 = np.empty((0,0)) - avg_kappa = np.empty((0,0)) - avg_mcc = np.empty((0,0)) - avg_training_time = np.empty((0,0)) - - name_counter += 1 - - progress.value += 1 - - def highlight_max(s): - to_highlight = s == s.max() - return ['background-color: yellow' if v else '' for v in to_highlight] - - def highlight_cols(s): - color = 'lightgrey' - return 'background-color: %s' % color - - if y.value_counts().count() > 2: - - compare_models_ = master_display.style.apply(highlight_max,subset=['Accuracy','Recall', - 'Prec.','F1','Kappa', 'MCC']).applymap(highlight_cols, subset = ['TT (Sec)']) - else: - - compare_models_ = master_display.style.apply(highlight_max,subset=['Accuracy','AUC','Recall', - 'Prec.','F1','Kappa', 'MCC']).applymap(highlight_cols, subset = ['TT (Sec)']) - - compare_models_ = compare_models_.set_precision(round) - compare_models_ = compare_models_.set_properties(**{'text-align': 'left'}) - compare_models_ = compare_models_.set_table_styles([dict(selector='th', props=[('text-align', 'left')])]) - - progress.value += 1 - - monitor.iloc[1,1:] = 'Compiling Final Model' - monitor.iloc[3,1:] = 'Almost Finished' - - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - sorted_model_names = list(compare_models_.data['Model']) - if n_select < 0: - sorted_model_names = sorted_model_names[n_select:] - else: - sorted_model_names = sorted_model_names[:n_select] - - model_store_final = [] - - model_fit_start = time.time() - - logger.info("Finalizing top_n models") - - logger.info("SubProcess create_model() called ==================================") - for i in sorted_model_names: - monitor.iloc[2,1:] = i - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - progress.value += 1 - k = model_dict.get(i) - m = create_model(estimator=k, verbose = False, system=False, cross_validation=True) - model_store_final.append(m) - logger.info("SubProcess create_model() end ==================================") - - model_fit_end = time.time() - - model_fit_time = np.array(model_fit_end - model_fit_start).round(2) - - if len(model_store_final) == 1: - model_store_final = model_store_final[0] - - clear_output() - - if verbose: - if html_param: - display(compare_models_) - else: - print(compare_models_.data) - - pd.reset_option("display.max_columns") - - #store in display container - display_container.append(compare_models_.data) - - logger.info("create_model_container: " + str(len(create_model_container))) - logger.info("master_model_container: " + str(len(master_model_container))) - logger.info("display_container: " + str(len(display_container))) - - logger.info(str(model_store_final)) - logger.info("compare_models() succesfully completed......................................") - - return 
model_store_final
-
-def tune_model(estimator = None,
-               fold = 10,
-               round = 4,
-               n_iter = 10,
-               custom_grid = None, #added in pycaret==2.0.0
-               optimize = 'Accuracy',
-               choose_better = False, #added in pycaret==2.0.0
-               verbose = True):
-
-
-    """
-
-    Description:
-    ------------
-    This function tunes the hyperparameters of a model and scores it using Stratified
-    Cross Validation. The output prints a score grid that shows Accuracy, AUC, Recall,
-    Precision, F1, Kappa and MCC by fold (by default = 10 Folds).
-
-    This function returns a trained model object.
-
-    Example
-    -------
-    from pycaret.datasets import get_data
-    juice = get_data('juice')
-    experiment_name = setup(data = juice, target = 'Purchase')
-    xgboost = create_model('xgboost')
-
-    tuned_xgboost = tune_model(xgboost)
-
-    This will tune the hyperparameters of the Extreme Gradient Boosting Classifier.
-
-
-    Parameters
-    ----------
-    estimator : object, default = None
-    A trained model object should be passed as an estimator.
-
-    fold: integer, default = 10
-    Number of folds to be used in Kfold CV. Must be at least 2.
-
-    round: integer, default = 4
-    Number of decimal places the metrics in the score grid will be rounded to.
-
-    n_iter: integer, default = 10
-    Number of iterations within the Random Grid Search. For every iteration,
-    the model randomly selects one value from the pre-defined grid of hyperparameters.
-
-    custom_grid: dictionary, default = None
-    To use custom hyperparameters for tuning, pass a dictionary with parameter names
-    and values to be iterated. When set to None, it uses the pre-defined tuning grid.
-
-    optimize: string, default = 'Accuracy'
-    Measure used to select the best model through hyperparameter tuning.
-    The default scoring measure is 'Accuracy'. Other measures include 'AUC',
-    'Recall', 'Precision', 'F1' and 'MCC'.
-
-    choose_better: Boolean, default = False
-    When set to True, the base estimator is returned when the performance doesn't
-    improve by tune_model. This guarantees the returned object performs at least
-    as well as the base estimator created using create_model or the model returned
-    by compare_models.
-
-    verbose: Boolean, default = True
-    Score grid is not printed when verbose is set to False.
-
-    Returns:
-    --------
-
-    score grid:  A table containing the scores of the model across the kfolds.
-    -----------  Scoring metrics used are Accuracy, AUC, Recall, Precision, F1,
-                 Kappa and MCC. Mean and standard deviation of the scores across
-                 the folds are also returned.
-
-    model:       trained and tuned model object.
-    -----------
-
-    Warnings:
-    ---------
-
-    - If target variable is multiclass (more than 2 classes), optimize param 'AUC' is
-      not acceptable.
- - - If target variable is multiclass (more than 2 classes), AUC will be returned as - zero (0.0) - - - - """ - - ''' - - ERROR HANDLING STARTS HERE - - ''' - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing tune_model()") - logger.info("""tune_model(estimator={}, fold={}, round={}, n_iter={}, custom_grid={}, optimize={}, choose_better={}, verbose={})""".\ - format(str(estimator), str(fold), str(round), str(n_iter), str(custom_grid), str(optimize), str(choose_better), str(verbose))) - - logger.info("Checking exceptions") - - #exception checking - import sys - - #run_time - import datetime, time - runtime_start = time.time() - - #checking estimator if string - if type(estimator) is str: - sys.exit('(Type Error): The behavior of tune_model in version 1.0.1 is changed. Please pass trained model object.') - - #restrict VotingClassifier - if hasattr(estimator,'voting'): - sys.exit('(Type Error): VotingClassifier not allowed under tune_model().') - - #checking fold parameter - if type(fold) is not int: - sys.exit('(Type Error): Fold parameter only accepts integer value.') - - #checking round parameter - if type(round) is not int: - sys.exit('(Type Error): Round parameter only accepts integer value.') - - #checking n_iter parameter - if type(n_iter) is not int: - sys.exit('(Type Error): n_iter parameter only accepts integer value.') - - #checking optimize parameter - allowed_optimize = ['Accuracy', 'Recall', 'Precision', 'F1', 'AUC', 'MCC'] - if optimize not in allowed_optimize: - sys.exit('(Value Error): Optimization method not supported. See docstring for list of available parameters.') - - #checking optimize parameter for multiclass - if y.value_counts().count() > 2: - if optimize == 'AUC': - sys.exit('(Type Error): AUC metric not supported for multiclass problems. See docstring for list of other optimization parameters.') - - if type(n_iter) is not int: - sys.exit('(Type Error): n_iter parameter only accepts integer value.') - - #checking verbose parameter - if type(verbose) is not bool: - sys.exit('(Type Error): Verbose parameter can only take argument as True or False.') - - - ''' - - ERROR HANDLING ENDS HERE - - ''' - - logger.info("Preloading libraries") - - #pre-load libraries - import pandas as pd - import ipywidgets as ipw - from IPython.display import display, HTML, clear_output, update_display - - logger.info("Preparing display monitor") - - #progress bar - progress = ipw.IntProgress(value=0, min=0, max=fold+6, step=1 , description='Processing: ') - master_display = pd.DataFrame(columns=['Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa', 'MCC']) - if verbose: - if html_param: - display(progress) - - #display monitor - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ], - ['ETC' , '. . . . . . . . . . . . . . . . . 
-                              columns=['', ' ', '   ']).set_index('')
-
-    if verbose:
-        if html_param:
-            display(monitor, display_id = 'monitor')
-            display_ = display(master_display, display_id=True)
-            display_id = display_.display_id
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    logger.info("Copying training dataset")
-    #Storing X_train and y_train in data_X and data_y parameter
-    data_X = X_train.copy()
-    data_y = y_train.copy()
-
-    #reset index
-    data_X.reset_index(drop=True, inplace=True)
-    data_y.reset_index(drop=True, inplace=True)
-
-    progress.value += 1
-
-    logger.info("Importing libraries")
-    #general dependencies
-    import random
-    import numpy as np
-    from sklearn import metrics
-    from sklearn.model_selection import StratifiedKFold
-    from sklearn.model_selection import RandomizedSearchCV
-
-    #setting numpy seed
-    np.random.seed(seed)
-
-    #setting optimize parameter
-    if optimize == 'Accuracy':
-        optimize = 'accuracy'
-        compare_dimension = 'Accuracy'
-
-    elif optimize == 'AUC':
-        optimize = 'roc_auc'
-        compare_dimension = 'AUC'
-
-    elif optimize == 'Recall':
-        if y.value_counts().count() > 2:
-            optimize = metrics.make_scorer(metrics.recall_score, average = 'macro')
-        else:
-            optimize = 'recall'
-        compare_dimension = 'Recall'
-
-    elif optimize == 'Precision':
-        if y.value_counts().count() > 2:
-            optimize = metrics.make_scorer(metrics.precision_score, average = 'weighted')
-        else:
-            optimize = 'precision'
-        compare_dimension = 'Prec.'
-
-    elif optimize == 'F1':
-        if y.value_counts().count() > 2:
-            optimize = metrics.make_scorer(metrics.f1_score, average = 'weighted')
-        else:
-            optimize = 'f1'
-        compare_dimension = 'F1'
-
-    elif optimize == 'MCC':
-        optimize = 'roc_auc' # roc_auc used instead because MCC cannot be passed as a scoring string to RandomizedSearchCV
-        compare_dimension = 'MCC'
-
-
-    #convert trained estimator into string name for grids
-
-    logger.info("Checking base model")
-    def get_model_name(e):
-        return str(e).split("(")[0]
-
-    if len(estimator.classes_) > 2:
-        mn = get_model_name(estimator.estimator)
-    else:
-        mn = get_model_name(estimator)
-
-    if 'catboost' in mn:
-        mn = 'CatBoostClassifier'
-
-    model_dict = {'ExtraTreesClassifier' : 'et',
-                  'GradientBoostingClassifier' : 'gbc',
-                  'RandomForestClassifier' : 'rf',
-                  'LGBMClassifier' : 'lightgbm',
-                  'XGBClassifier' : 'xgboost',
-                  'AdaBoostClassifier' : 'ada',
-                  'DecisionTreeClassifier' : 'dt',
-                  'RidgeClassifier' : 'ridge',
-                  'LogisticRegression' : 'lr',
-                  'KNeighborsClassifier' : 'knn',
-                  'GaussianNB' : 'nb',
-                  'SGDClassifier' : 'svm',
-                  'SVC' : 'rbfsvm',
-                  'GaussianProcessClassifier' : 'gpc',
-                  'MLPClassifier' : 'mlp',
-                  'QuadraticDiscriminantAnalysis' : 'qda',
-                  'LinearDiscriminantAnalysis' : 'lda',
-                  'CatBoostClassifier' : 'catboost',
-                  'BaggingClassifier' : 'Bagging'}
-
-    model_dict_logging = {'ExtraTreesClassifier' : 'Extra Trees Classifier',
-                          'GradientBoostingClassifier' : 'Gradient Boosting Classifier',
-                          'RandomForestClassifier' : 'Random Forest Classifier',
-                          'LGBMClassifier' : 'Light Gradient Boosting Machine',
-                          'XGBClassifier' : 'Extreme Gradient Boosting',
-                          'AdaBoostClassifier' : 'Ada Boost Classifier',
-                          'DecisionTreeClassifier' : 'Decision Tree Classifier',
-                          'RidgeClassifier' : 'Ridge Classifier',
-                          'LogisticRegression' : 'Logistic Regression',
-                          'KNeighborsClassifier' : 'K Neighbors Classifier',
-                          'GaussianNB' : 'Naive Bayes',
-                          'SGDClassifier' : 'SVM - Linear Kernel',
-                          'SVC' : 'SVM - Radial Kernel',
-                          
'GaussianProcessClassifier' : 'Gaussian Process Classifier', - 'MLPClassifier' : 'MLP Classifier', - 'QuadraticDiscriminantAnalysis' : 'Quadratic Discriminant Analysis', - 'LinearDiscriminantAnalysis' : 'Linear Discriminant Analysis', - 'CatBoostClassifier' : 'CatBoost Classifier', - 'BaggingClassifier' : 'Bagging Classifier', - 'VotingClassifier' : 'Voting Classifier'} - - _estimator_ = estimator - - estimator = model_dict.get(mn) - - logger.info('Base model : ' + str(model_dict_logging.get(mn))) - - progress.value += 1 - - logger.info("Defining folds") - kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param) - - logger.info("Declaring metric variables") - score_auc =np.empty((0,0)) - score_acc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_mcc=np.empty((0,0)) - score_training_time=np.empty((0,0)) - avgs_auc =np.empty((0,0)) - avgs_acc =np.empty((0,0)) - avgs_recall =np.empty((0,0)) - avgs_precision =np.empty((0,0)) - avgs_f1 =np.empty((0,0)) - avgs_kappa =np.empty((0,0)) - avgs_mcc=np.empty((0,0)) - avgs_training_time=np.empty((0,0)) - - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Searching Hyperparameters' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - logger.info("Defining Hyperparameters") - logger.info("Initializing RandomizedSearchCV") - - #setting turbo parameters - cv = 3 - - if estimator == 'knn': - - from sklearn.neighbors import KNeighborsClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'n_neighbors': range(1,51), - 'weights' : ['uniform', 'distance'], - 'metric':["euclidean", "manhattan"] - } - - model_grid = RandomizedSearchCV(estimator=KNeighborsClassifier(n_jobs=n_jobs_param), param_distributions=param_grid, - scoring=optimize, n_iter=n_iter, cv=cv, random_state=seed, - n_jobs=n_jobs_param, iid=False) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'lr': - - from sklearn.linear_model import LogisticRegression - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'C': np.arange(0, 10, 0.001), - "penalty": [ 'l1', 'l2'], - "class_weight": ["balanced", None] - } - model_grid = RandomizedSearchCV(estimator=LogisticRegression(random_state=seed, n_jobs=n_jobs_param), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, cv=cv, - random_state=seed, iid=False, n_jobs=n_jobs_param) - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'dt': - - from sklearn.tree import DecisionTreeClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {"max_depth": np.random.randint(1, (len(X_train.columns)*.85),20), - "max_features": np.random.randint(1, len(X_train.columns),20), - "min_samples_leaf": [2,3,4,5,6], - "criterion": ["gini", "entropy"], - } - - model_grid = RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=seed), param_distributions=param_grid, - scoring=optimize, n_iter=n_iter, cv=cv, random_state=seed, - iid=False, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = 
model_grid.best_params_ - - elif estimator == 'mlp': - - from sklearn.neural_network import MLPClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'learning_rate': ['constant', 'invscaling', 'adaptive'], - 'solver' : ['lbfgs', 'sgd', 'adam'], - 'alpha': np.arange(0, 1, 0.0001), - 'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,), (100,50,100), (100,100,100)], - 'activation': ["tanh", "identity", "logistic","relu"] - } - - model_grid = RandomizedSearchCV(estimator=MLPClassifier(max_iter=1000, random_state=seed), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, cv=cv, - random_state=seed, iid=False, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'gpc': - - from sklearn.gaussian_process import GaussianProcessClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {"max_iter_predict":[100,200,300,400,500,600,700,800,900,1000]} - - model_grid = RandomizedSearchCV(estimator=GaussianProcessClassifier(random_state=seed, n_jobs=n_jobs_param), param_distributions=param_grid, - scoring=optimize, n_iter=n_iter, cv=cv, random_state=seed, - n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'rbfsvm': - - from sklearn.svm import SVC - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'C': np.arange(0, 50, 0.01), - "class_weight": ["balanced", None]} - - model_grid = RandomizedSearchCV(estimator=SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'nb': - - from sklearn.naive_bayes import GaussianNB - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'var_smoothing': [0.000000001, 0.000000002, 0.000000005, 0.000000008, 0.000000009, - 0.0000001, 0.0000002, 0.0000003, 0.0000005, 0.0000007, 0.0000009, - 0.00001, 0.001, 0.002, 0.003, 0.004, 0.005, 0.007, 0.009, - 0.004, 0.005, 0.006, 0.007,0.008, 0.009, 0.01, 0.1, 1] - } - - model_grid = RandomizedSearchCV(estimator=GaussianNB(), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'svm': - - from sklearn.linear_model import SGDClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'penalty': ['l2', 'l1','elasticnet'], - 'l1_ratio': np.arange(0,1,0.01), - 'alpha': [0.0001, 0.001, 0.01, 0.0002, 0.002, 0.02, 0.0005, 0.005, 0.05], - 'fit_intercept': [True, False], - 'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], - 'eta0': [0.001, 0.01,0.05,0.1,0.2,0.3,0.4,0.5] - } - - model_grid = RandomizedSearchCV(estimator=SGDClassifier(loss='hinge', random_state=seed, n_jobs=n_jobs_param), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, 
n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'ridge': - - from sklearn.linear_model import RidgeClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'alpha': np.arange(0,1,0.001), - 'fit_intercept': [True, False], - 'normalize': [True, False] - } - - model_grid = RandomizedSearchCV(estimator=RidgeClassifier(random_state=seed), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'rf': - - from sklearn.ensemble import RandomForestClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], - 'criterion': ['gini', 'entropy'], - 'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)], - 'min_samples_split': [2, 5, 7, 9, 10], - 'min_samples_leaf' : [1, 2, 4], - 'max_features' : ['auto', 'sqrt', 'log2'], - 'bootstrap': [True, False] - } - - model_grid = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=seed, n_jobs=n_jobs_param), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'ada': - - from sklearn.ensemble import AdaBoostClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'n_estimators': np.arange(10,200,5), - 'learning_rate': np.arange(0,1,0.01), - 'algorithm' : ["SAMME", "SAMME.R"] - } - - if y.value_counts().count() > 2: - base_estimator_input = _estimator_.estimator.base_estimator - else: - base_estimator_input = _estimator_.base_estimator - - model_grid = RandomizedSearchCV(estimator=AdaBoostClassifier(base_estimator = base_estimator_input, random_state=seed), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'gbc': - - from sklearn.ensemble import GradientBoostingClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'n_estimators': np.arange(10,200,5), - 'learning_rate': np.arange(0,1,0.01), - 'subsample' : np.arange(0.1,1,0.05), - 'min_samples_split' : [2,4,5,7,9,10], - 'min_samples_leaf' : [1,2,3,4,5], - 'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)], - 'max_features' : ['auto', 'sqrt', 'log2'] - } - - model_grid = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=seed), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'qda': - - from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = 
{'reg_param': np.arange(0,1,0.01)} - - model_grid = RandomizedSearchCV(estimator=QuadraticDiscriminantAnalysis(), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'lda': - - from sklearn.discriminant_analysis import LinearDiscriminantAnalysis - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'solver' : ['lsqr', 'eigen'], - 'shrinkage': [None, 0.0001, 0.001, 0.01, 0.0005, 0.005, 0.05, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1] - } - - model_grid = RandomizedSearchCV(estimator=LinearDiscriminantAnalysis(), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'et': - - from sklearn.ensemble import ExtraTreesClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'n_estimators': np.arange(10,200,5), - 'criterion': ['gini', 'entropy'], - 'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)], - 'min_samples_split': [2, 5, 7, 9, 10], - 'min_samples_leaf' : [1, 2, 4], - 'max_features' : ['auto', 'sqrt', 'log2'], - 'bootstrap': [True, False] - } - - model_grid = RandomizedSearchCV(estimator=ExtraTreesClassifier(random_state=seed, n_jobs=n_jobs_param), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - - elif estimator == 'xgboost': - - from xgboost import XGBClassifier - - num_class = y.value_counts().count() - - if custom_grid is not None: - param_grid = custom_grid - - elif y.value_counts().count() > 2: - - param_grid = {'learning_rate': np.arange(0,1,0.01), - 'n_estimators': np.arange(10,500,20), - 'subsample': [0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1], - 'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)], - 'colsample_bytree': [0.5, 0.7, 0.9, 1], - 'min_child_weight': [1, 2, 3, 4], - 'num_class' : [num_class, num_class] - } - else: - param_grid = {'learning_rate': np.arange(0,1,0.01), - 'n_estimators':[10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], - 'subsample': [0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1], - 'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)], - 'colsample_bytree': [0.5, 0.7, 0.9, 1], - 'min_child_weight': [1, 2, 3, 4], - } - - model_grid = RandomizedSearchCV(estimator=XGBClassifier(random_state=seed, n_jobs=n_jobs_param, verbosity=0), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - - elif estimator == 'lightgbm': - - import lightgbm as lgb - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'num_leaves': [10,20,30,40,50,60,70,80,90,100,150,200], - 'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)], - 'learning_rate': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1], - 'n_estimators': [10, 30, 50, 70, 90, 100, 120, 
150, 170, 200], - 'min_split_gain' : [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9], - 'reg_alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], - 'reg_lambda': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] - } - - model_grid = RandomizedSearchCV(estimator=lgb.LGBMClassifier(random_state=seed, n_jobs=n_jobs_param), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - - elif estimator == 'catboost': - - from catboost import CatBoostClassifier - - if custom_grid is not None: - param_grid = custom_grid - else: - param_grid = {'depth':[3,1,2,6,4,5,7,8,9,10], - 'iterations':[250,100,500,1000], - 'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], - 'l2_leaf_reg':[3,1,5,10,100], - 'border_count':[32,5,10,20,50,100,200], - } - - model_grid = RandomizedSearchCV(estimator=CatBoostClassifier(random_state=seed, silent=True, thread_count=n_jobs_param), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - elif estimator == 'Bagging': - - from sklearn.ensemble import BaggingClassifier - - if custom_grid is not None: - param_grid = custom_grid - - else: - param_grid = {'n_estimators': np.arange(10,300,10), - 'bootstrap': [True, False], - 'bootstrap_features': [True, False], - } - - model_grid = RandomizedSearchCV(estimator=BaggingClassifier(base_estimator=_estimator_.base_estimator, random_state=seed, n_jobs=n_jobs_param), - param_distributions=param_grid, scoring=optimize, n_iter=n_iter, - cv=cv, random_state=seed, n_jobs=n_jobs_param) - - model_grid.fit(X_train,y_train) - model = model_grid.best_estimator_ - best_model = model_grid.best_estimator_ - best_model_param = model_grid.best_params_ - - progress.value += 1 - progress.value += 1 - progress.value += 1 - - logger.info("Random search completed") - - #multiclass checking - if y.value_counts().count() > 2: - from sklearn.multiclass import OneVsRestClassifier - model = OneVsRestClassifier(model) - best_model = model - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Initializing CV' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - fold_num = 1 - - for train_i , test_i in kf.split(data_X,data_y): - - logger.info("Initializing Fold " + str(fold_num)) - - t0 = time.time() - - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Fitting Fold ' + str(fold_num) + ' of ' + str(fold) - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i] - ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i] - time_start=time.time() - - if fix_imbalance_param: - - logger.info("Initializing SMOTE") - - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state = seed) - else: - resampler = fix_imbalance_method_param - - Xtrain,ytrain = resampler.fit_sample(Xtrain, ytrain) - logger.info("Resampling completed") - - if hasattr(model, 'predict_proba'): - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - pred_prob = 
model.predict_proba(Xtest)
-            pred_prob = pred_prob[:,1]
-            pred_ = model.predict(Xtest)
-            sca = metrics.accuracy_score(ytest,pred_)
-
-            if y.value_counts().count() > 2:
-                sc = 0
-                recall = metrics.recall_score(ytest,pred_, average='macro')
-                precision = metrics.precision_score(ytest,pred_, average = 'weighted')
-                f1 = metrics.f1_score(ytest,pred_, average='weighted')
-
-            else:
-                try:
-                    sc = metrics.roc_auc_score(ytest,pred_prob)
-                except:
-                    sc = 0
-                    logger.warning("AUC could not be calculated. AUC set to 0.00")
-                recall = metrics.recall_score(ytest,pred_)
-                precision = metrics.precision_score(ytest,pred_)
-                f1 = metrics.f1_score(ytest,pred_)
-
-        else:
-            logger.info("Fitting Model")
-            model.fit(Xtrain,ytrain)
-            logger.info("Evaluating Metrics")
-            pred_prob = 0.00
-            logger.warning("model has no predict_proba attribute. pred_prob set to 0.00")
-            pred_ = model.predict(Xtest)
-            sca = metrics.accuracy_score(ytest,pred_)
-
-            if y.value_counts().count() > 2:
-                sc = 0
-                recall = metrics.recall_score(ytest,pred_, average='macro')
-                precision = metrics.precision_score(ytest,pred_, average = 'weighted')
-                f1 = metrics.f1_score(ytest,pred_, average='weighted')
-
-            else:
-                try:
-                    sc = metrics.roc_auc_score(ytest,pred_prob)
-                except:
-                    sc = 0
-                    logger.warning("model has no predict_proba attribute. AUC set to 0.00")
-                recall = metrics.recall_score(ytest,pred_)
-                precision = metrics.precision_score(ytest,pred_)
-                f1 = metrics.f1_score(ytest,pred_)
-
-        logger.info("Compiling Metrics")
-        time_end=time.time()
-        kappa = metrics.cohen_kappa_score(ytest,pred_)
-        mcc = metrics.matthews_corrcoef(ytest,pred_)
-        training_time=time_end-time_start
-        score_acc = np.append(score_acc,sca)
-        score_auc = np.append(score_auc,sc)
-        score_recall = np.append(score_recall,recall)
-        score_precision = np.append(score_precision,precision)
-        score_f1 =np.append(score_f1,f1)
-        score_kappa =np.append(score_kappa,kappa)
-        score_mcc=np.append(score_mcc,mcc)
-        score_training_time=np.append(score_training_time,training_time)
-
-        progress.value += 1
-
-
-        '''
-
-        This section is created to update_display() as code loops through the fold defined.
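-        (The ETC shown in the monitor below is a simple linear extrapolation:
-        the time taken by the fold just completed, multiplied by the number
-        of folds remaining.)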
- - ''' - - fold_results = pd.DataFrame({'Accuracy':[sca], 'AUC': [sc], 'Recall': [recall], - 'Prec.': [precision], 'F1': [f1], 'Kappa': [kappa], 'MCC':[mcc]}).round(round) - master_display = pd.concat([master_display, fold_results],ignore_index=True) - fold_results = [] - - ''' - - TIME CALCULATION SUB-SECTION STARTS HERE - - ''' - - t1 = time.time() - - tt = (t1 - t0) * (fold-fold_num) / 60 - tt = np.around(tt, 2) - - if tt < 1: - tt = str(np.around((tt * 60), 2)) - ETC = tt + ' Seconds Remaining' - - else: - tt = str (tt) - ETC = tt + ' Minutes Remaining' - - if verbose: - if html_param: - update_display(ETC, display_id = 'ETC') - - fold_num += 1 - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[2,1:] = ETC - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - ''' - - TIME CALCULATION ENDS HERE - - ''' - - if verbose: - if html_param: - update_display(master_display, display_id = display_id) - - ''' - - Update_display() ends here - - ''' - - progress.value += 1 - - logger.info("Calculating mean and std") - mean_acc=np.mean(score_acc) - mean_auc=np.mean(score_auc) - mean_recall=np.mean(score_recall) - mean_precision=np.mean(score_precision) - mean_f1=np.mean(score_f1) - mean_kappa=np.mean(score_kappa) - mean_mcc=np.mean(score_mcc) - mean_training_time=np.sum(score_training_time) - std_acc=np.std(score_acc) - std_auc=np.std(score_auc) - std_recall=np.std(score_recall) - std_precision=np.std(score_precision) - std_f1=np.std(score_f1) - std_kappa=np.std(score_kappa) - std_mcc=np.std(score_mcc) - std_training_time=np.std(score_training_time) - - avgs_acc = np.append(avgs_acc, mean_acc) - avgs_acc = np.append(avgs_acc, std_acc) - avgs_auc = np.append(avgs_auc, mean_auc) - avgs_auc = np.append(avgs_auc, std_auc) - avgs_recall = np.append(avgs_recall, mean_recall) - avgs_recall = np.append(avgs_recall, std_recall) - avgs_precision = np.append(avgs_precision, mean_precision) - avgs_precision = np.append(avgs_precision, std_precision) - avgs_f1 = np.append(avgs_f1, mean_f1) - avgs_f1 = np.append(avgs_f1, std_f1) - avgs_kappa = np.append(avgs_kappa, mean_kappa) - avgs_kappa = np.append(avgs_kappa, std_kappa) - - avgs_mcc = np.append(avgs_mcc, mean_mcc) - avgs_mcc = np.append(avgs_mcc, std_mcc) - avgs_training_time = np.append(avgs_training_time, mean_training_time) - avgs_training_time = np.append(avgs_training_time, std_training_time) - - progress.value += 1 - - logger.info("Creating metrics dataframe") - model_results = pd.DataFrame({'Accuracy': score_acc, 'AUC': score_auc, 'Recall' : score_recall, 'Prec.' : score_precision , - 'F1' : score_f1, 'Kappa' : score_kappa, 'MCC':score_mcc}) - model_avgs = pd.DataFrame({'Accuracy': avgs_acc, 'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' 
: avgs_precision , 
-                               'F1' : avgs_f1,  'Kappa' : avgs_kappa, 'MCC':avgs_mcc},index=['Mean', 'SD'])
-
-    model_results = model_results.append(model_avgs)
-    model_results = model_results.round(round)
-
-    # highlight the mean row in yellow
-    model_results=model_results.style.apply(lambda x: ['background: yellow' if (x.name == 'Mean') else '' for i in x], axis=1)
-    model_results = model_results.set_precision(round)
-
-    progress.value += 1
-
-    #refitting the model on complete X_train, y_train
-    monitor.iloc[1,1:] = 'Finalizing Model'
-    monitor.iloc[2,1:] = 'Almost Finished'
-    if verbose:
-        if html_param:
-            update_display(monitor, display_id = 'monitor')
-
-    model_fit_start = time.time()
-    logger.info("Finalizing model")
-    best_model.fit(data_X, data_y)
-    model_fit_end = time.time()
-
-    model_fit_time = np.array(model_fit_end - model_fit_start).round(2)
-
-    progress.value += 1
-
-    #storing results in create_model_container
-    logger.info("Uploading results into container")
-    create_model_container.append(model_results.data)
-    display_container.append(model_results.data)
-
-    #storing results in master_model_container
-    logger.info("Uploading model into container")
-    master_model_container.append(best_model)
-
-    '''
-    When choose_better is set to True, the optimize metric in the score grid is
-    compared with the base model created using create_model so that tune_model
-    returns the model with the better score only. This ensures model performance
-    is at least equivalent to what is seen in compare_models.
-    '''
-    if choose_better:
-        logger.info("choose_better activated")
-        if verbose:
-            if html_param:
-                monitor.iloc[1,1:] = 'Compiling Final Results'
-                monitor.iloc[2,1:] = 'Almost Finished'
-                update_display(monitor, display_id = 'monitor')
-
-        #creating base model for comparison
-        logger.info("SubProcess create_model() called ==================================")
-        if estimator in ['Bagging', 'ada']:
-            base_model = create_model(estimator=_estimator_, verbose = False, system=False)
-        else:
-            base_model = create_model(estimator=estimator, verbose = False, system=False)
-        logger.info("SubProcess create_model() end ==================================")
-        base_model_results = create_model_container[-1][compare_dimension][-2:][0]
-        tuned_model_results = create_model_container[-2][compare_dimension][-2:][0]
-
-        if tuned_model_results > base_model_results:
-            best_model = best_model
-        else:
-            best_model = base_model
-
-        #re-instate display_container state
-        display_container.pop(-1)
-        logger.info("choose_better completed")
-
-    #end runtime
-    runtime_end = time.time()
-    runtime = np.array(runtime_end - runtime_start).round(2)
-
-    #mlflow logging
-    if logging_param:
-
-        logger.info("Creating MLFlow logs")
-
-        #Creating Logs message monitor
-        monitor.iloc[1,1:] = 'Creating Logs'
-        monitor.iloc[2,1:] = 'Almost Finished'
-        if verbose:
-            if html_param:
-                update_display(monitor, display_id = 'monitor')
-
-        import mlflow
-        from pathlib import Path
-        import os
-
-        mlflow.set_experiment(exp_name_log)
-        full_name = model_dict_logging.get(mn)
-
-        with mlflow.start_run(run_name=full_name) as run:
-
-            # Get active run to log as tag
-            RunID = mlflow.active_run().info.run_id
-
-            # Log model parameters (values longer than 250 characters are
-            # dropped, since MLflow rejects oversized param values)
-            params = best_model.get_params()
-
-            for i in list(params):
-                v = params.get(i)
-                if len(str(v)) > 250:
-                    params.pop(i)
-
-            mlflow.log_params(params)
-
-            mlflow.log_metrics({"Accuracy": avgs_acc[0], "AUC": avgs_auc[0], "Recall": avgs_recall[0], "Precision" : avgs_precision[0],
-                                "F1": avgs_f1[0], "Kappa": avgs_kappa[0], "MCC": avgs_mcc[0]})
"MCC": avgs_mcc[0]}) - - #set tag of compare_models - mlflow.set_tag("Source", "tune_model") - - import secrets - URI = secrets.token_hex(nbytes=4) - mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - # Log training time in seconds - mlflow.log_metric("TT", model_fit_time) - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(best_model, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - # Log the CV results as model_results.html artifact - model_results.data.to_html('Results.html', col_space=65, justify='left') - mlflow.log_artifact('Results.html') - os.remove('Results.html') - - # Generate hold-out predictions and save as html - holdout = predict_model(best_model, verbose=False) - holdout_score = pull() - display_container.pop(-1) - holdout_score.to_html('Holdout.html', col_space=65, justify='left') - mlflow.log_artifact('Holdout.html') - os.remove('Holdout.html') - - # Log AUC and Confusion Matrix plot - if log_plots_param: - - logger.info("SubProcess plot_model() called ==================================") - - try: - plot_model(model, plot = 'auc', verbose=False, save=True, system=False) - mlflow.log_artifact('AUC.png') - os.remove("AUC.png") - except: - pass - - try: - plot_model(model, plot = 'confusion_matrix', verbose=False, save=True, system=False) - mlflow.log_artifact('Confusion Matrix.png') - os.remove("Confusion Matrix.png") - except: - pass - - try: - plot_model(model, plot = 'feature', verbose=False, save=True, system=False) - mlflow.log_artifact('Feature Importance.png') - os.remove("Feature Importance.png") - except: - pass - - logger.info("SubProcess plot_model() end ==================================") - - # Log hyperparameter tuning grid - d1 = model_grid.cv_results_.get('params') - dd = pd.DataFrame.from_dict(d1) - dd['Score'] = model_grid.cv_results_.get('mean_test_score') - dd.to_html('Iterations.html', col_space=75, justify='left') - mlflow.log_artifact('Iterations.html') - os.remove('Iterations.html') - - if verbose: - clear_output() - if html_param: - display(model_results) - else: - print(model_results.data) - - logger.info("create_model_container: " + str(len(create_model_container))) - logger.info("master_model_container: " + str(len(master_model_container))) - logger.info("display_container: " + str(len(display_container))) - - logger.info(str(best_model)) - logger.info("tune_model() succesfully completed......................................") - - return best_model - -def blend_models(estimator_list = 'All', - fold = 10, - round = 4, - choose_better = False, #added in pycaret==2.0.0 - optimize = 'Accuracy', #added in pycaret==2.0.0 - method = 'hard', - turbo = True, - verbose = True): - - """ - - Description: - ------------ - This function creates a Soft Voting / Majority Rule classifier for all the - estimators in the model library (excluding the few when turbo is True) or - for specific trained estimators passed as a list in estimator_list param. - It scores it using Stratified Cross Validation. 
-    grid that shows Accuracy, AUC, Recall, Precision, F1, Kappa and MCC by
-    fold (default CV = 10 Folds).
-
-    This function returns a trained model object.
-
-    Example:
-    --------
-    from pycaret.datasets import get_data
-    juice = get_data('juice')
-    experiment_name = setup(data = juice,  target = 'Purchase')
-
-    blend_all = blend_models()
-
-    This will create a VotingClassifier for all models in the model library
-    except for 'rbfsvm', 'gpc' and 'mlp'.
-
-    For specific models, you can use:
-
-    lr = create_model('lr')
-    rf = create_model('rf')
-    knn = create_model('knn')
-
-    blend_three = blend_models(estimator_list = [lr,rf,knn])
-
-    This will create a VotingClassifier of lr, rf and knn.
-
-    Parameters
-    ----------
-    estimator_list : string ('All') or list of object, default = 'All'
-    List of trained model objects, or 'All' to use every estimator in the model library.
-
-    fold: integer, default = 10
-    Number of folds to be used in Kfold CV. Must be at least 2.
-
-    round: integer, default = 4
-    Number of decimal places the metrics in the score grid will be rounded to.
-
-    choose_better: Boolean, default = False
-    When set to True, the base estimator is returned when the metric doesn't
-    improve by blend_models. This guarantees the returned object would perform
-    at least equivalent to the base estimator created using create_model or the
-    model returned by compare_models.
-
-    optimize: string, default = 'Accuracy'
-    Only used when choose_better is set to True. optimize parameter is used
-    to compare the blended model with the base estimators. Values accepted in
-    optimize parameter are 'Accuracy', 'AUC', 'Recall', 'Precision', 'F1',
-    'Kappa', 'MCC'.
-
-    method: string, default = 'hard'
-    'hard' uses predicted class labels for majority rule voting. 'soft' predicts
-    the class label based on the argmax of the sums of the predicted probabilities,
-    which is recommended for an ensemble of well-calibrated classifiers.
-
-    turbo: Boolean, default = True
-    When turbo is set to True, it blacklists estimators that use the Radial Kernel.
-
-    verbose: Boolean, default = True
-    Score grid is not printed when verbose is set to False.
-
-    Returns:
-    --------
-
-    score grid:   A table containing the scores of the model across the kfolds.
-    -----------   Scoring metrics used are Accuracy, AUC, Recall, Precision, F1,
-                  Kappa and MCC. Mean and standard deviation of the scores across
-                  the folds are also returned.
-
-    model:        trained Voting Classifier model object.
-    -----------
-
-    Warnings:
-    ---------
-    - When passing estimator_list with method set to 'soft', all the models in the
-      estimator_list must support the predict_proba function. 'svm' and 'ridge' don't
-      support predict_proba, hence an exception will be raised.
-
-    - When estimator_list is set to 'All' and method is forced to 'soft', estimators
-      that don't support the predict_proba function will be dropped from the estimator
-      list.
-
-    - CatBoost Classifier not supported in blend_models().
-
-    - If target variable is multiclass (more than 2 classes), AUC will be returned as
-      zero (0.0).
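-
-    As the first warning notes, soft voting requires every estimator in the
-    list to support predict_proba. A minimal soft-voting call, reusing the
-    models from the example above (illustrative sketch only):
-
-    blend_soft = blend_models(estimator_list = [lr,rf,knn], method = 'soft')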
-
-
-    """
-
-
-    '''
-
-    ERROR HANDLING STARTS HERE
-
-    '''
-
-    import logging
-
-    try:
-        hasattr(logger, 'name')
-    except:
-        logger = logging.getLogger('logs')
-        logger.setLevel(logging.DEBUG)
-
-        # create console handler and set level to debug
-        if logger.hasHandlers():
-            logger.handlers.clear()
-
-        ch = logging.FileHandler('logs.log')
-        ch.setLevel(logging.DEBUG)
-
-        # create formatter
-        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-
-        # add formatter to ch
-        ch.setFormatter(formatter)
-
-        # add ch to logger
-        logger.addHandler(ch)
-
-    logger.info("Initializing blend_models()")
-    logger.info("""blend_models(estimator_list={}, fold={}, round={}, choose_better={}, optimize={}, method={}, turbo={}, verbose={})""".\
-        format(str(estimator_list), str(fold), str(round), str(choose_better), str(optimize), str(method), str(turbo), str(verbose)))
-
-    logger.info("Checking exceptions")
-
-    #exception checking
-    import sys
-
-    #run_time
-    import datetime, time
-    runtime_start = time.time()
-
-    #checking error for estimator_list (string)
-
-    if estimator_list != 'All':
-        if type(estimator_list) is not list:
-            sys.exit("(Value Error): estimator_list parameter only accepts 'All' as string or list of trained models.")
-
-    if estimator_list != 'All':
-        for i in estimator_list:
-            if 'sklearn' not in str(type(i)) and 'CatBoostClassifier' not in str(type(i)):
-                sys.exit("(Value Error): estimator_list parameter only accepts 'All' as string or trained model object.")
-
-    #checking method param with estimator list
-    if estimator_list != 'All':
-        if method == 'soft':
-
-            check = 0
-
-            for i in estimator_list:
-                if hasattr(i, 'predict_proba'):
-                    pass
-                else:
-                    check += 1
-
-            if check >= 1:
-                sys.exit("(Type Error): Estimator list contains an estimator that doesn't support probabilities and method is forced to soft. Either change the method or drop the estimator.")
-
-    #checking catboost:
-    if estimator_list != 'All':
-        for i in estimator_list:
-            if 'CatBoostClassifier' in str(i):
-                sys.exit('(Type Error): CatBoost Classifier not supported in this function.')
-
-    #checking fold parameter
-    if type(fold) is not int:
-        sys.exit('(Type Error): Fold parameter only accepts integer value.')
-
-    #checking round parameter
-    if type(round) is not int:
-        sys.exit('(Type Error): Round parameter only accepts integer value.')
-
-    #checking method parameter
-    available_method = ['soft', 'hard']
-    if method not in available_method:
-        sys.exit("(Value Error): Method parameter only accepts 'soft' or 'hard' as a parameter. See Docstring for details.")
-
-    #checking turbo parameter
-    if type(turbo) is not bool:
-        sys.exit('(Type Error): Turbo parameter can only take argument as True or False.')
-
-    #checking verbose parameter
-    if type(verbose) is not bool:
-        sys.exit('(Type Error): Verbose parameter can only take argument as True or False.')
-
-    '''
-
-    ERROR HANDLING ENDS HERE
-
-    '''
-
-    logger.info("Preloading libraries")
-    #pre-load libraries
-    import pandas as pd
-    import ipywidgets as ipw
-    from IPython.display import display, HTML, clear_output, update_display
-
-    logger.info("Preparing display monitor")
-    #progress bar
-    progress = ipw.IntProgress(value=0, min=0, max=fold+4, step=1 , description='Processing: ')
-    master_display = pd.DataFrame(columns=['Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa', 'MCC'])
-    if verbose:
-        if html_param:
-            display(progress)
-
-    #display monitor
-    timestampStr = datetime.datetime.now().strftime("%H:%M:%S")
-    monitor = pd.DataFrame( [ ['Initiated' , '. . . . . .
. . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ], - ['ETC' , '. . . . . . . . . . . . . . . . . .', 'Calculating ETC'] ], - columns=['', ' ', ' ']).set_index('') - - if verbose: - if html_param: - display(monitor, display_id = 'monitor') - - if verbose: - if html_param: - display_ = display(master_display, display_id=True) - display_id = display_.display_id - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - logger.info("Importing libraries") - #general dependencies - import numpy as np - from sklearn import metrics - from sklearn.model_selection import StratifiedKFold - from sklearn.ensemble import VotingClassifier - import re - - logger.info("Copying training dataset") - #Storing X_train and y_train in data_X and data_y parameter - data_X = X_train.copy() - data_y = y_train.copy() - - #reset index - data_X.reset_index(drop=True, inplace=True) - data_y.reset_index(drop=True, inplace=True) - - if optimize == 'Accuracy': - compare_dimension = 'Accuracy' - elif optimize == 'AUC': - compare_dimension = 'AUC' - elif optimize == 'Recall': - compare_dimension = 'Recall' - elif optimize == 'Precision': - compare_dimension = 'Prec.' - elif optimize == 'F1': - compare_dimension = 'F1' - elif optimize == 'Kappa': - compare_dimension = 'Kappa' - elif optimize == 'MCC': - compare_dimension = 'MCC' - - #estimator_list_flag - if estimator_list == 'All': - all_flag = True - else: - all_flag = False - - progress.value += 1 - - logger.info("Declaring metric variables") - score_auc =np.empty((0,0)) - score_acc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_mcc =np.empty((0,0)) - score_training_time =np.empty((0,0)) - - avgs_auc =np.empty((0,0)) - avgs_acc =np.empty((0,0)) - avgs_recall =np.empty((0,0)) - avgs_precision =np.empty((0,0)) - avgs_f1 =np.empty((0,0)) - avgs_kappa =np.empty((0,0)) - avgs_mcc =np.empty((0,0)) - avgs_training_time =np.empty((0,0)) - - avg_acc = np.empty((0,0)) - avg_auc = np.empty((0,0)) - avg_recall = np.empty((0,0)) - avg_precision = np.empty((0,0)) - avg_f1 = np.empty((0,0)) - avg_kappa = np.empty((0,0)) - avg_mcc = np.empty((0,0)) - avg_training_time = np.empty((0,0)) - - logger.info("Defining folds") - kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param) - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Compiling Estimators' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - if estimator_list == 'All': - - logger.info("Importing untrained models") - from sklearn.linear_model import LogisticRegression - from sklearn.neighbors import KNeighborsClassifier - from sklearn.naive_bayes import GaussianNB - from sklearn.tree import DecisionTreeClassifier - from sklearn.linear_model import SGDClassifier - from sklearn.svm import SVC - from sklearn.gaussian_process import GaussianProcessClassifier - from sklearn.neural_network import MLPClassifier - from sklearn.linear_model import RidgeClassifier - from sklearn.ensemble import RandomForestClassifier - from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis - from sklearn.ensemble import AdaBoostClassifier - from sklearn.ensemble import GradientBoostingClassifier - from sklearn.discriminant_analysis import LinearDiscriminantAnalysis - from sklearn.ensemble import ExtraTreesClassifier - from sklearn.ensemble import 
BaggingClassifier - from xgboost import XGBClassifier - import lightgbm as lgb - - lr = LogisticRegression(random_state=seed) #don't add n_jobs parameter as it slows down the LR - knn = KNeighborsClassifier(n_jobs=n_jobs_param) - nb = GaussianNB() - dt = DecisionTreeClassifier(random_state=seed) - svm = SGDClassifier(max_iter=1000, tol=0.001, random_state=seed, n_jobs=n_jobs_param) - rbfsvm = SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed) - gpc = GaussianProcessClassifier(random_state=seed, n_jobs=n_jobs_param) - mlp = MLPClassifier(max_iter=500, random_state=seed) - ridge = RidgeClassifier(random_state=seed) - rf = RandomForestClassifier(n_estimators=10, random_state=seed, n_jobs=n_jobs_param) - qda = QuadraticDiscriminantAnalysis() - ada = AdaBoostClassifier(random_state=seed) - gbc = GradientBoostingClassifier(random_state=seed) - lda = LinearDiscriminantAnalysis() - et = ExtraTreesClassifier(random_state=seed, n_jobs=n_jobs_param) - xgboost = XGBClassifier(random_state=seed, verbosity=0, n_jobs=n_jobs_param) - lightgbm = lgb.LGBMClassifier(random_state=seed, n_jobs=n_jobs_param) - - logger.info("Import successful") - - progress.value += 1 - - logger.info("Defining estimator list") - if turbo: - if method == 'hard': - estimator_list = [lr,knn,nb,dt,svm,ridge,rf,qda,ada,gbc,lda,et,xgboost,lightgbm] - voting = 'hard' - elif method == 'soft': - estimator_list = [lr,knn,nb,dt,rf,qda,ada,gbc,lda,et,xgboost,lightgbm] - voting = 'soft' - else: - if method == 'hard': - estimator_list = [lr,knn,nb,dt,svm,rbfsvm,gpc,mlp,ridge,rf,qda,ada,gbc,lda,et,xgboost,lightgbm] - voting = 'hard' - elif method == 'soft': - estimator_list = [lr,knn,nb,dt,rbfsvm,gpc,mlp,rf,qda,ada,gbc,lda,et,xgboost,lightgbm] - voting = 'soft' - - else: - - estimator_list = estimator_list - voting = method - - logger.info("Defining model names in estimator_list") - model_names = [] - - for names in estimator_list: - - model_names = np.append(model_names, str(names).split("(")[0]) - - def putSpace(input): - words = re.findall('[A-Z][a-z]*', input) - words = ' '.join(words) - return words - - model_names_modified = [] - - for i in model_names: - - model_names_modified.append(putSpace(i)) - model_names = model_names_modified - - model_names_final = [] - - for j in model_names_modified: - - if j == 'Gaussian N B': - model_names_final.append('Naive Bayes') - - elif j == 'M L P Classifier': - model_names_final.append('MLP Classifier') - - elif j == 'S G D Classifier': - model_names_final.append('SVM - Linear Kernel') - - elif j == 'S V C': - model_names_final.append('SVM - Radial Kernel') - - elif j == 'X G B Classifier': - model_names_final.append('Extreme Gradient Boosting') - - elif j == 'L G B M Classifier': - model_names_final.append('Light Gradient Boosting Machine') - - else: - model_names_final.append(j) - - model_names = model_names_final - - #adding n in model_names to avoid duplicate exception when custom list is passed for eg. 
BaggingClassifier - - model_names_n = [] - counter = 0 - - for i in model_names: - mn = str(i) + '_' + str(counter) - model_names_n.append(mn) - counter += 1 - - model_names = model_names_n - - estimator_list = estimator_list - - estimator_list_ = zip(model_names, estimator_list) - estimator_list_ = set(estimator_list_) - estimator_list_ = list(estimator_list_) - - try: - model = VotingClassifier(estimators=estimator_list_, voting=voting, n_jobs=n_jobs_param) - model.fit(data_X,data_y) - logger.info("n_jobs multiple passed") - except: - logger.info("n_jobs multiple failed") - model = VotingClassifier(estimators=estimator_list_, voting=voting) - - progress.value += 1 - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Initializing CV' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - fold_num = 1 - - for train_i , test_i in kf.split(data_X,data_y): - - logger.info("Initializing Fold " + str(fold_num)) - - progress.value += 1 - - t0 = time.time() - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Fitting Fold ' + str(fold_num) + ' of ' + str(fold) - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i] - ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i] - time_start=time.time() - - if fix_imbalance_param: - logger.info("Initializing SMOTE") - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state = seed) - else: - resampler = fix_imbalance_method_param - - Xtrain,ytrain = resampler.fit_sample(Xtrain, ytrain) - logger.info("Resampling completed") - - if voting == 'hard': - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - pred_prob = 0.0 - logger.warning("model has no predict_proba attribute. 
pred_prob set to 0.00") - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - sc = 0.0 - if y.value_counts().count() > 2: - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average='weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - else: - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - else: - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - pred_prob = 0 - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average='weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - else: - try: - pred_prob = model.predict_proba(Xtest) - pred_prob = pred_prob[:,1] - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - logger.info("Compiling Metrics") - time_end=time.time() - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - training_time=time_end-time_start - score_acc = np.append(score_acc,sca) - score_auc = np.append(score_auc,sc) - score_recall = np.append(score_recall,recall) - score_precision = np.append(score_precision,precision) - score_f1 =np.append(score_f1,f1) - score_kappa =np.append(score_kappa,kappa) - score_mcc =np.append(score_mcc,mcc) - score_training_time =np.append(score_training_time,training_time) - - - ''' - - This section handles time calculation and is created to update_display() as code loops through - the fold defined. 
- - ''' - - fold_results = pd.DataFrame({'Accuracy':[sca], 'AUC': [sc], 'Recall': [recall], - 'Prec.': [precision], 'F1': [f1], 'Kappa': [kappa], 'MCC':[mcc]}).round(round) - master_display = pd.concat([master_display, fold_results],ignore_index=True) - fold_results = [] - - ''' - TIME CALCULATION SUB-SECTION STARTS HERE - ''' - t1 = time.time() - - tt = (t1 - t0) * (fold-fold_num) / 60 - tt = np.around(tt, 2) - - if tt < 1: - tt = str(np.around((tt * 60), 2)) - ETC = tt + ' Seconds Remaining' - - else: - tt = str (tt) - ETC = tt + ' Minutes Remaining' - - fold_num += 1 - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[2,1:] = ETC - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - ''' - TIME CALCULATION ENDS HERE - ''' - - if verbose: - if html_param: - update_display(master_display, display_id = display_id) - - - ''' - - Update_display() ends here - - ''' - logger.info("Calculating mean and std") - mean_acc=np.mean(score_acc) - mean_auc=np.mean(score_auc) - mean_recall=np.mean(score_recall) - mean_precision=np.mean(score_precision) - mean_f1=np.mean(score_f1) - mean_kappa=np.mean(score_kappa) - mean_mcc=np.mean(score_mcc) - mean_training_time=np.sum(score_training_time) - std_acc=np.std(score_acc) - std_auc=np.std(score_auc) - std_recall=np.std(score_recall) - std_precision=np.std(score_precision) - std_f1=np.std(score_f1) - std_kappa=np.std(score_kappa) - std_mcc=np.std(score_mcc) - std_training_time=np.std(score_training_time) - - avgs_acc = np.append(avgs_acc, mean_acc) - avgs_acc = np.append(avgs_acc, std_acc) - avgs_auc = np.append(avgs_auc, mean_auc) - avgs_auc = np.append(avgs_auc, std_auc) - avgs_recall = np.append(avgs_recall, mean_recall) - avgs_recall = np.append(avgs_recall, std_recall) - avgs_precision = np.append(avgs_precision, mean_precision) - avgs_precision = np.append(avgs_precision, std_precision) - avgs_f1 = np.append(avgs_f1, mean_f1) - avgs_f1 = np.append(avgs_f1, std_f1) - avgs_kappa = np.append(avgs_kappa, mean_kappa) - avgs_kappa = np.append(avgs_kappa, std_kappa) - - avgs_mcc = np.append(avgs_mcc, mean_mcc) - avgs_mcc = np.append(avgs_mcc, std_mcc) - avgs_training_time = np.append(avgs_training_time, mean_training_time) - avgs_training_time = np.append(avgs_training_time, std_training_time) - - progress.value += 1 - - logger.info("Creating metrics dataframe") - model_results = pd.DataFrame({'Accuracy': score_acc, 'AUC': score_auc, 'Recall' : score_recall, 'Prec.' : score_precision , - 'F1' : score_f1, 'Kappa' : score_kappa, 'MCC' : score_mcc}) - model_avgs = pd.DataFrame({'Accuracy': avgs_acc, 'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' 
: avgs_precision , 
-                               'F1' : avgs_f1,  'Kappa' : avgs_kappa, 'MCC' : avgs_mcc},index=['Mean', 'SD'])
-    model_results = model_results.append(model_avgs)
-    model_results = model_results.round(round)
-
-    # highlight the mean row in yellow
-    model_results=model_results.style.apply(lambda x: ['background: yellow' if (x.name == 'Mean') else '' for i in x], axis=1)
-    model_results = model_results.set_precision(round)
-
-    progress.value += 1
-
-    #refitting the model on complete X_train, y_train
-    monitor.iloc[1,1:] = 'Finalizing Model'
-    monitor.iloc[2,1:] = 'Almost Finished'
-
-    if verbose:
-        if html_param:
-            update_display(monitor, display_id = 'monitor')
-
-    model_fit_start = time.time()
-    logger.info("Finalizing model")
-    model.fit(data_X, data_y)
-    model_fit_end = time.time()
-
-    model_fit_time = np.array(model_fit_end - model_fit_start).round(2)
-
-    progress.value += 1
-
-    #storing results in create_model_container
-    logger.info("Uploading results into container")
-    create_model_container.append(model_results.data)
-    display_container.append(model_results.data)
-
-    #storing results in master_model_container
-    logger.info("Uploading model into container")
-    master_model_container.append(model)
-
-    '''
-    When choose_better is set to True, the optimize metric in the score grid is
-    compared with the base models created using create_model so that blend_models
-    returns the model with the better score only. This ensures model performance
-    is at least equivalent to what is seen in compare_models.
-    '''
-
-    scorer = []
-
-    blend_model_results = create_model_container[-1][compare_dimension][-2:][0]
-
-    scorer.append(blend_model_results)
-
-    if choose_better and all_flag is False:
-        logger.info("choose_better activated")
-        if verbose:
-            if html_param:
-                monitor.iloc[1,1:] = 'Compiling Final Results'
-                monitor.iloc[2,1:] = 'Almost Finished'
-                update_display(monitor, display_id = 'monitor')
-
-        base_models_ = []
-        logger.info("SubProcess create_model() called ==================================")
-        for i in estimator_list:
-            m = create_model(i,verbose=False, system=False)
-            s = create_model_container[-1][compare_dimension][-2:][0]
-            scorer.append(s)
-            base_models_.append(m)
-
-            #re-instate display_container state
-            display_container.pop(-1)
-
-        logger.info("SubProcess create_model() end ==================================")
-        logger.info("choose_better completed")
-
-        index_scorer = scorer.index(max(scorer))
-
-        if index_scorer == 0:
-            model = model
-        else:
-            model = base_models_[index_scorer-1]
-
-    #end runtime
-    runtime_end = time.time()
-    runtime = np.array(runtime_end - runtime_start).round(2)
-
-    if logging_param:
-
-        logger.info("Creating MLFlow logs")
-
-        #Creating Logs message monitor
-        monitor.iloc[1,1:] = 'Creating Logs'
-        monitor.iloc[2,1:] = 'Almost Finished'
-        if verbose:
-            if html_param:
-                update_display(monitor, display_id = 'monitor')
-
-        import mlflow
-        from pathlib import Path
-        import os
-
-        with mlflow.start_run(run_name='Voting Classifier') as run:
-
-            # Get active run to log as tag
-            RunID = mlflow.active_run().info.run_id
-
-            mlflow.log_metrics({"Accuracy": avgs_acc[0], "AUC": avgs_auc[0], "Recall": avgs_recall[0], "Precision" : avgs_precision[0],
-                                "F1": avgs_f1[0], "Kappa": avgs_kappa[0], "MCC": avgs_mcc[0]})
-
-            # Log model and transformation pipeline
-            logger.info("SubProcess save_model() called ==================================")
-            save_model(model, 'Trained Model', verbose=False)
-            logger.info("SubProcess save_model() end ==================================")
-            mlflow.log_artifact('Trained Model' + '.pkl')
-            size_bytes = Path('Trained Model.pkl').stat().st_size
-            size_kb = np.round(size_bytes/1000, 2)
-            mlflow.set_tag("Size KB", size_kb)
-            os.remove('Trained Model.pkl')
-
-            # Generate hold-out predictions and save as html
-            holdout = predict_model(model, verbose=False)
-            holdout_score = pull()
-            display_container.pop(-1)
-            holdout_score.to_html('Holdout.html', col_space=65, justify='left')
-            mlflow.log_artifact('Holdout.html')
-            os.remove('Holdout.html')
-
-            #set tag of compare_models
-            mlflow.set_tag("Source", "blend_models")
-
-            import secrets
-            URI = secrets.token_hex(nbytes=4)
-            mlflow.set_tag("URI", URI)
-            mlflow.set_tag("USI", USI)
-            mlflow.set_tag("Run Time", runtime)
-            mlflow.set_tag("Run ID", RunID)
-
-            # Log training time of compare_models
-            mlflow.log_metric("TT", model_fit_time)
-
-            # Log AUC and Confusion Matrix plot
-            if log_plots_param:
-
-                logger.info("SubProcess plot_model() called ==================================")
-
-                try:
-                    plot_model(model, plot = 'confusion_matrix', verbose=False, save=True, system=False)
-                    mlflow.log_artifact('Confusion Matrix.png')
-                    os.remove("Confusion Matrix.png")
-                except:
-                    pass
-
-                logger.info("SubProcess plot_model() end ==================================")
-
-            # Log the CV results as model_results.html artifact
-            model_results.data.to_html('Results.html', col_space=65, justify='left')
-            mlflow.log_artifact('Results.html')
-            os.remove('Results.html')
-
-    if verbose:
-        clear_output()
-        if html_param:
-            display(model_results)
-        else:
-            print(model_results.data)
-
-    logger.info("create_model_container: " + str(len(create_model_container)))
-    logger.info("master_model_container: " + str(len(master_model_container)))
-    logger.info("display_container: " + str(len(display_container)))
-
-    logger.info(str(model))
-    logger.info("blend_models() successfully completed......................................")
-
-    return model
-
-def stack_models(estimator_list,
-                 meta_model = None,
-                 fold = 10,
-                 round = 4,
-                 method = 'soft',
-                 restack = True,
-                 plot = False,
-                 choose_better = False, #added in pycaret==2.0.0
-                 optimize = 'Accuracy', #added in pycaret==2.0.0
-                 finalize = False,
-                 verbose = True):
-
-    """
-
-    Description:
-    ------------
-    This function creates a meta model and scores it using Stratified Cross Validation.
-    The predictions from the base level models as passed in the estimator_list param
-    are used as input features for the meta model. The restacking parameter controls
-    the ability to expose raw features to the meta model when set to True
-    (default = True).
-
-    The output prints the score grid that shows Accuracy, AUC, Recall, Precision,
-    F1, Kappa and MCC by fold (default = 10 Folds).
-
-    This function returns a container which is the list of all models in stacking.
-
-    Example:
-    --------
-    from pycaret.datasets import get_data
-    juice = get_data('juice')
-    experiment_name = setup(data = juice,  target = 'Purchase')
-    dt = create_model('dt')
-    rf = create_model('rf')
-    ada = create_model('ada')
-    ridge = create_model('ridge')
-    knn = create_model('knn')
-
-    stacked_models = stack_models(estimator_list=[dt,rf,ada,ridge,knn])
-
-    This will create a meta model that will use the predictions of all the
-    models provided in estimator_list param. By default, the meta model is
-    Logistic Regression but can be changed with meta_model param.
-
-    Parameters
-    ----------
-    estimator_list : list of objects
-
-    meta_model : object, default = None
-    If set to None, Logistic Regression is used as a meta model.
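-
-    For example, to use a different estimator as the meta model (an
-    illustrative sketch, reusing the models created in the example above):
-
-    xgboost = create_model('xgboost')
-    stacked_models = stack_models(estimator_list=[dt,rf,knn], meta_model=xgboost)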
-
- fold: integer, default = 10
- Number of folds to be used in Kfold CV. Must be at least 2.
-
- round: integer, default = 4
- Number of decimal places the metrics in the score grid will be rounded to.
-
- method: string, default = 'soft'
- 'soft', uses predicted probabilities as an input to the meta model.
- 'hard', uses predicted class labels as an input to the meta model.
-
- restack: Boolean, default = True
- When restack is set to True, raw data will be exposed to the meta model when
- making predictions; otherwise, when False, only the predicted labels or
- probabilities are passed to the meta model when making final predictions.
-
- plot: Boolean, default = False
- When plot is set to True, it will return the correlation plot of predictions
- from all base models provided in estimator_list.
-
- choose_better: Boolean, default = False
- When set to True, the base estimator is returned when the metric doesn't
- improve by stacking. This guarantees the returned object performs at least
- as well as the base estimator created using create_model or the model
- returned by compare_models.
-
- optimize: string, default = 'Accuracy'
- Only used when choose_better is set to True. The optimize parameter is used
- to compare the stacked model with the base estimator. Values accepted in
- the optimize parameter are 'Accuracy', 'AUC', 'Recall', 'Precision', 'F1',
- 'Kappa' and 'MCC'.
-
- finalize: Boolean, default = False
- When finalize is set to True, it will fit the stacker on the entire dataset
- including the hold-out sample created during the setup() stage. It is not
- recommended to set this to True here; if you would like to fit the stacker
- on the entire dataset including the hold-out, use finalize_model().
-
- verbose: Boolean, default = True
- Score grid is not printed when verbose is set to False.
-
- Returns:
- --------
-
- score grid: A table containing the scores of the model across the kfolds.
- ----------- Scoring metrics used are Accuracy, AUC, Recall, Precision, F1,
- Kappa and MCC. Mean and standard deviation of the scores across
- the folds are also returned.
-
- container: list of all the models where the last element is the meta model.
- ----------
-
- Warnings:
- ---------
- - When the method is forced to be 'soft' and estimator_list param includes
- estimators that do not support the predict_proba method such as 'svm' or
- 'ridge', predicted values for those specific estimators only are used
- instead of probabilities when building the meta_model. The same rule applies
- when the stacker is used under the predict_model() function.
-
- - If the target variable is multiclass (more than 2 classes), AUC will be
- returned as zero (0.0).
-
- - method 'soft' is not supported when the target is multiclass.
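-
- Illustrative sketch (added for clarity, not produced by this function;
- assumes the binary 'juice' example above and its five base models):
-
- # method='soft' feeds each base model's positive-class CV probability
- # to the meta model; method='hard' feeds the predicted label instead.
- stacker = stack_models(estimator_list=[dt,rf,ada,ridge,knn],
- method='hard', restack=False)
- # with restack=False the meta model trains on the 5 prediction columns
- # only; with restack=True they are appended to the raw features.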
-
-
- """
-
- '''
-
- ERROR HANDLING STARTS HERE
-
- '''
-
- import logging
-
- try:
- hasattr(logger, 'name')
- except:
- logger = logging.getLogger('logs')
- logger.setLevel(logging.DEBUG)
-
- # create file handler and set level to debug
- if logger.hasHandlers():
- logger.handlers.clear()
-
- ch = logging.FileHandler('logs.log')
- ch.setLevel(logging.DEBUG)
-
- # create formatter
- formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-
- # add formatter to ch
- ch.setFormatter(formatter)
-
- # add ch to logger
- logger.addHandler(ch)
-
- logger.info("Initializing stack_models()")
- logger.info("""stack_models(estimator_list={}, meta_model={}, fold={}, round={}, method={}, restack={}, plot={}, choose_better={}, optimize={}, finalize={}, verbose={})""".\
- format(str(estimator_list), str(meta_model), str(fold), str(round), str(method), str(restack), str(plot), str(choose_better), str(optimize), str(finalize), str(verbose)))
-
- logger.info("Checking exceptions")
-
- #exception checking
- import sys
-
- #run_time
- import datetime, time
- runtime_start = time.time()
-
- #change method param to 'hard' for multiclass
- if y.value_counts().count() > 2:
- method = 'hard'
-
- #checking error for estimator_list
- for i in estimator_list:
- if 'sklearn' not in str(type(i)) and 'CatBoostClassifier' not in str(type(i)):
- sys.exit("(Value Error): estimator_list parameter only accepts trained model objects.")
-
- #checking meta model
- if meta_model is not None:
- if 'sklearn' not in str(type(meta_model)) and 'CatBoostClassifier' not in str(type(meta_model)):
- sys.exit("(Value Error): meta_model parameter only accepts a trained model object.")
-
- #stacking with multiclass
- if y.value_counts().count() > 2:
- if method == 'soft':
- sys.exit("(Type Error): method 'soft' not supported for multiclass problems.")
-
- #checking fold parameter
- if type(fold) is not int:
- sys.exit('(Type Error): Fold parameter only accepts integer value.')
-
- #checking round parameter
- if type(round) is not int:
- sys.exit('(Type Error): Round parameter only accepts integer value.')
-
- #checking method parameter
- available_method = ['soft', 'hard']
- if method not in available_method:
- sys.exit("(Value Error): Method parameter only accepts 'soft' or 'hard' as a parameter. See Docstring for details.")
-
- #checking restack parameter
- if type(restack) is not bool:
- sys.exit('(Type Error): Restack parameter can only take argument as True or False.')
-
- #checking plot parameter
- if type(plot) is not bool:
- sys.exit('(Type Error): Plot parameter can only take argument as True or False.')
-
- #checking verbose parameter
- if type(verbose) is not bool:
- sys.exit('(Type Error): Verbose parameter can only take argument as True or False.')
-
- '''
-
- ERROR HANDLING ENDS HERE
-
- '''
-
- logger.info("Preloading libraries")
- #pre-load libraries
- import pandas as pd
- import ipywidgets as ipw
- from IPython.display import display, HTML, clear_output, update_display
- from copy import deepcopy
- from sklearn.base import clone
-
- logger.info("Copying estimator list")
- #copy estimator_list
- estimator_list = deepcopy(estimator_list)
-
- logger.info("Defining meta model")
- #Defining meta model.
- if meta_model is None:
- from sklearn.linear_model import LogisticRegression
- meta_model = LogisticRegression()
- else:
- meta_model = deepcopy(meta_model)
-
- clear_output()
-
- import warnings
- warnings.filterwarnings('default')
- warnings.warn('This function will adopt StackingClassifier() from sklearn in a future release of PyCaret 2.x.')
- warnings.filterwarnings('ignore')
-
- if optimize == 'Accuracy':
- compare_dimension = 'Accuracy'
- elif optimize == 'AUC':
- compare_dimension = 'AUC'
- elif optimize == 'Recall':
- compare_dimension = 'Recall'
- elif optimize == 'Precision':
- compare_dimension = 'Prec.'
- elif optimize == 'F1':
- compare_dimension = 'F1'
- elif optimize == 'Kappa':
- compare_dimension = 'Kappa'
- elif optimize == 'MCC':
- compare_dimension = 'MCC'
-
- logger.info("Preparing display monitor")
- #progress bar
- max_progress = len(estimator_list) + fold + 4
- progress = ipw.IntProgress(value=0, min=0, max=max_progress, step=1, description='Processing: ')
- master_display = pd.DataFrame(columns=['Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa', 'MCC'])
- if verbose:
- if html_param:
- display(progress)
-
- #display monitor
- timestampStr = datetime.datetime.now().strftime("%H:%M:%S")
- monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ],
- ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ],
- ['ETC' , '. . . . . . . . . . . . . . . . . .', 'Calculating ETC'] ],
- columns=['', ' ', ' ']).set_index('')
-
- if verbose:
- if html_param:
- display(monitor, display_id = 'monitor')
-
- if verbose:
- if html_param:
- display_ = display(master_display, display_id=True)
- display_id = display_.display_id
-
- logger.info("Importing libraries")
- #dependencies
- import numpy as np
- from sklearn import metrics
- from sklearn.model_selection import StratifiedKFold
- from sklearn.model_selection import cross_val_predict
- import seaborn as sns
- import matplotlib.pyplot as plt
-
- progress.value += 1
-
- #Capturing the method of stacking required by user:
- #method='soft' means 'predict_proba', otherwise 'predict'
- if method == 'soft':
- predict_method = 'predict_proba'
- elif method == 'hard':
- predict_method = 'predict'
-
- logger.info("Copying training dataset")
- #defining data_X and data_y
- if finalize:
- data_X = X.copy()
- data_y = y.copy()
- else:
- data_X = X_train.copy()
- data_y = y_train.copy()
-
- #reset index
- data_X.reset_index(drop=True,inplace=True)
- data_y.reset_index(drop=True,inplace=True)
-
- #models_ for appending
- models_ = []
-
- logger.info("Getting model names")
- #defining model_library model names
- model_names = np.zeros(0)
- for item in estimator_list:
- model_names = np.append(model_names, str(item).split("(")[0])
-
- model_names_fixed = []
-
- for i in model_names:
- if 'CatBoostClassifier' in i:
- a = 'CatBoostClassifier'
- model_names_fixed.append(a)
- else:
- model_names_fixed.append(i)
-
- model_names = model_names_fixed
-
- model_names_fixed = []
-
- counter = 0
- for i in model_names:
- s = str(i) + '_' + str(counter)
- model_names_fixed.append(s)
- counter += 1
-
- base_array = np.zeros((0,0))
- base_prediction = pd.DataFrame(data_y) #changed to data_y
- base_prediction = base_prediction.reset_index(drop=True)
-
- counter = 0
-
- model_fit_start = time.time()
-
- for model in estimator_list:
-
- logger.info("Checking base model : " + str(model_names[counter]))
-
- '''
- MONITOR UPDATE STARTS
- '''
-
- monitor.iloc[1,1:] = 'Evaluating ' + model_names[counter]
- if verbose:
- if html_param:
- update_display(monitor, display_id = 'monitor')
-
- '''
- MONITOR UPDATE ENDS
- '''
-
- #fitting and appending
- logger.info("Fitting base model")
- model.fit(data_X, data_y)
- models_.append(model)
-
- progress.value += 1
-
- logger.info("Generating cross val predictions")
- #fall back to 'predict' for estimators without predict_proba
- try:
- base_array = cross_val_predict(model,data_X,data_y,cv=fold, method=predict_method)
- except:
- base_array = cross_val_predict(model,data_X,data_y,cv=fold, method='predict')
- #keep only the positive-class probability column when available
- if method == 'soft':
- try:
- base_array = base_array[:,1]
- except:
- pass
- base_array_df = pd.DataFrame(base_array)
- base_prediction = pd.concat([base_prediction,base_array_df],axis=1)
- base_array = np.empty((0,0))
-
- counter += 1
-
- logger.info("Base layer complete")
-
- #fill nas for base_prediction
- base_prediction.fillna(value=0, inplace=True)
-
- #defining column names now
- target_col_name = np.array(base_prediction.columns[0])
- model_names = np.append(target_col_name, model_names_fixed) #added fixed here
- base_prediction.columns = model_names #defining column names now
-
- #defining data_X and data_y dataframe to be used in next stage.
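- #Illustrative note (added commentary, not original pipeline code): at this
- #point base_prediction holds the target column followed by one cross-validated
- #prediction column per base model, named as above, e.g.
- #['Purchase', 'LogisticRegression_0', 'RandomForestClassifier_1']; the restack
- #flag below decides whether these columns are appended to the raw features or
- #used alone as the meta features.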
- - #drop column from base_prediction - base_prediction.drop(base_prediction.columns[0],axis=1,inplace=True) - - if restack: - data_X = pd.concat([data_X, base_prediction], axis=1) - - else: - data_X = base_prediction - - #Correlation matrix of base_prediction - #base_prediction_cor = base_prediction.drop(base_prediction.columns[0],axis=1) - base_prediction_cor = base_prediction.corr() - - #Meta Modeling Starts Here - model = meta_model #this defines model to be used below as model = meta_model (as captured above) - - #appending in models - model.fit(data_X, data_y) - models_.append(model) - - logger.info("Defining folds") - kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param) #capturing fold requested by user - - score_auc =np.empty((0,0)) - score_acc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_mcc =np.empty((0,0)) - score_training_time =np.empty((0,0)) - avgs_auc =np.empty((0,0)) - avgs_acc =np.empty((0,0)) - avgs_recall =np.empty((0,0)) - avgs_precision =np.empty((0,0)) - avgs_f1 =np.empty((0,0)) - avgs_kappa =np.empty((0,0)) - avgs_mcc =np.empty((0,0)) - avgs_training_time =np.empty((0,0)) - - progress.value += 1 - - fold_num = 1 - - for train_i , test_i in kf.split(data_X,data_y): - - logger.info("Initializing Fold " + str(fold_num)) - - t0 = time.time() - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Fitting Meta Model Fold ' + str(fold_num) + ' of ' + str(fold) - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - progress.value += 1 - - Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i] - ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i] - - if fix_imbalance_param: - logger.info("Initializing SMOTE") - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state = seed) - else: - resampler = fix_imbalance_method_param - - Xtrain,ytrain = resampler.fit_sample(Xtrain, ytrain) - logger.info("Resampling completed") - - time_start=time.time() - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - - try: - pred_prob = model.predict_proba(Xtest) - pred_prob = pred_prob[:,1] - except: - pass - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - - if y.value_counts().count() > 2: - recall = metrics.recall_score(ytest,pred_,average='macro') - precision = metrics.precision_score(ytest,pred_,average='weighted') - f1 = metrics.f1_score(ytest,pred_,average='weighted') - - else: - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - logger.info("Compiling Metrics") - time_end=time.time() - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - training_time=time_end-time_start - score_acc = np.append(score_acc,sca) - score_auc = np.append(score_auc,sc) - score_recall = np.append(score_recall,recall) - score_precision = np.append(score_precision,precision) - score_f1 =np.append(score_f1,f1) - score_kappa =np.append(score_kappa,kappa) - score_mcc =np.append(score_mcc,mcc) - score_training_time =np.append(score_training_time,training_time) - - ''' - - This section handles time calculation and is created to update_display() as code loops through - the fold defined. 
- - ''' - - fold_results = pd.DataFrame({'Accuracy':[sca], 'AUC': [sc], 'Recall': [recall], - 'Prec.': [precision], 'F1': [f1], 'Kappa': [kappa], 'MCC':[mcc]}).round(round) - master_display = pd.concat([master_display, fold_results],ignore_index=True) - fold_results = [] - - - ''' - - TIME CALCULATION SUB-SECTION STARTS HERE - - ''' - - t1 = time.time() - - tt = (t1 - t0) * (fold-fold_num) / 60 - tt = np.around(tt, 2) - - if tt < 1: - tt = str(np.around((tt * 60), 2)) - ETC = tt + ' Seconds Remaining' - - else: - tt = str (tt) - ETC = tt + ' Minutes Remaining' - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[2,1:] = ETC - - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - #update_display(ETC, display_id = 'ETC') - - fold_num += 1 - - - ''' - - TIME CALCULATION ENDS HERE - - ''' - - if verbose: - if html_param: - update_display(master_display, display_id = display_id) - - - ''' - - Update_display() ends here - - ''' - - model_fit_end = time.time() - model_fit_time = np.array(model_fit_end - model_fit_start).round(2) - - logger.info("Calculating mean and std") - mean_acc=np.mean(score_acc) - mean_auc=np.mean(score_auc) - mean_recall=np.mean(score_recall) - mean_precision=np.mean(score_precision) - mean_f1=np.mean(score_f1) - mean_kappa=np.mean(score_kappa) - mean_mcc=np.mean(score_mcc) - mean_training_time=np.sum(score_training_time) - std_acc=np.std(score_acc) - std_auc=np.std(score_auc) - std_recall=np.std(score_recall) - std_precision=np.std(score_precision) - std_f1=np.std(score_f1) - std_kappa=np.std(score_kappa) - std_mcc=np.std(score_mcc) - std_training_time=np.std(score_training_time) - - avgs_acc = np.append(avgs_acc, mean_acc) - avgs_acc = np.append(avgs_acc, std_acc) - avgs_auc = np.append(avgs_auc, mean_auc) - avgs_auc = np.append(avgs_auc, std_auc) - avgs_recall = np.append(avgs_recall, mean_recall) - avgs_recall = np.append(avgs_recall, std_recall) - avgs_precision = np.append(avgs_precision, mean_precision) - avgs_precision = np.append(avgs_precision, std_precision) - avgs_f1 = np.append(avgs_f1, mean_f1) - avgs_f1 = np.append(avgs_f1, std_f1) - avgs_kappa = np.append(avgs_kappa, mean_kappa) - avgs_kappa = np.append(avgs_kappa, std_kappa) - avgs_mcc = np.append(avgs_mcc, mean_mcc) - avgs_mcc = np.append(avgs_mcc, std_mcc) - avgs_training_time = np.append(avgs_training_time, mean_training_time) - avgs_training_time = np.append(avgs_training_time, std_training_time) - - logger.info("Creating metrics dataframe") - model_results = pd.DataFrame({'Accuracy': score_acc, 'AUC': score_auc, 'Recall' : score_recall, 'Prec.' : score_precision , - 'F1' : score_f1, 'Kappa' : score_kappa,'MCC':score_mcc}) - model_avgs = pd.DataFrame({'Accuracy': avgs_acc, 'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' 
: avgs_precision , - 'F1' : avgs_f1, 'Kappa' : avgs_kappa,'MCC':avgs_mcc},index=['Mean', 'SD']) - - model_results = model_results.append(model_avgs) - model_results = model_results.round(round) - - # yellow the mean - model_results=model_results.style.apply(lambda x: ['background: yellow' if (x.name == 'Mean') else '' for i in x], axis=1) - model_results = model_results.set_precision(round) - progress.value += 1 - - #appending method and restack param into models_ - models_.append(method) - models_.append(restack) - - #storing results in create_model_container - logger.info("Uploading results into container") - create_model_container.append(model_results.data) - if not finalize: - display_container.append(model_results.data) - - #storing results in master_model_container - logger.info("Uploading model into container") - master_model_container.append(models_) - - ''' - When choose_better sets to True. optimize metric in scoregrid is - compared with base model created using create_model so that stack_models - functions return the model with better score only. This will ensure - model performance is atleast equivalent to what is seen in compare_models - ''' - - scorer = [] - - stack_model_results = create_model_container[-1][compare_dimension][-2:][0] - - scorer.append(stack_model_results) - - if choose_better: - logger.info("choose_better activated") - - if verbose: - if html_param: - monitor.iloc[1,1:] = 'Compiling Final Results' - monitor.iloc[2,1:] = 'Almost Finished' - update_display(monitor, display_id = 'monitor') - - base_models_ = [] - logger.info("SubProcess create_model() called ==================================") - for i in estimator_list: - m = create_model(i,verbose=False, system=False) - s = create_model_container[-1][compare_dimension][-2:][0] - scorer.append(s) - base_models_.append(m) - - #re-instate display_constainer state - display_container.pop(-1) - - meta_model_clone = clone(meta_model) - mm = create_model(meta_model_clone, verbose=False, system=False) - base_models_.append(mm) - s = create_model_container[-1][compare_dimension][-2:][0] - scorer.append(s) - - #re-instate display_constainer state - display_container.pop(-1) - logger.info("SubProcess create_model() end ==================================") - logger.info("choose_better completed") - - #returning better model - index_scorer = scorer.index(max(scorer)) - - if index_scorer == 0: - models_ = models_ - else: - models_ = base_models_[index_scorer-1] - - if plot: - logger.info("Plotting correlation heatmap") - clear_output() - plt.subplots(figsize=(15,7)) - ax = sns.heatmap(base_prediction_cor, vmin=0.2, vmax=1, center=0,cmap='magma', square=True, annot=True, - linewidths=1) - ax.set_ylim(sorted(ax.get_xlim(), reverse=True)) - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - if logging_param and not finalize: - - logger.info("Creating MLFlow logs") - - import mlflow - from pathlib import Path - import os - - #Creating Logs message monitor - monitor.iloc[1,1:] = 'Creating Logs' - monitor.iloc[2,1:] = 'Almost Finished' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - with mlflow.start_run(run_name='Stacking Classifier') as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - params = meta_model.get_params() - - for i in list(params): - v = params.get(i) - if len(str(v)) > 250: - params.pop(i) - - mlflow.log_params(params) - - mlflow.log_metrics({"Accuracy": avgs_acc[0], "AUC": avgs_auc[0], 
"Recall": avgs_recall[0], "Precision" : avgs_precision[0], - "F1": avgs_f1[0], "Kappa": avgs_kappa[0], "MCC": avgs_mcc[0]}) - - #set tag of stack_models - mlflow.set_tag("Source", "stack_models") - - import secrets - URI = secrets.token_hex(nbytes=4) - mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(models_, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - # Log training time of compare_models - mlflow.log_metric("TT", model_fit_time) - - # Log the CV results as model_results.html artifact - model_results.data.to_html('Results.html', col_space=65, justify='left') - mlflow.log_artifact('Results.html') - os.remove('Results.html') - - if log_plots_param: - - plt.subplots(figsize=(15,7)) - ax = sns.heatmap(base_prediction_cor, vmin=0.2, vmax=1, center=0,cmap='magma', square=True, annot=True, - linewidths=1) - ax.set_ylim(sorted(ax.get_xlim(), reverse=True)) - plt.savefig("Stacking Heatmap.png") - mlflow.log_artifact('Stacking Heatmap.png') - os.remove('Stacking Heatmap.png') - plt.close() - - # Generate hold-out predictions and save as html - holdout = predict_model(models_, verbose=False) - holdout_score = pull() - display_container.pop(-1) - holdout_score.to_html('Holdout.html', col_space=65, justify='left') - mlflow.log_artifact('Holdout.html') - os.remove('Holdout.html') - - if verbose: - clear_output() - if html_param: - display(model_results) - else: - print(model_results.data) - - logger.info("create_model_container: " + str(len(create_model_container))) - logger.info("master_model_container: " + str(len(master_model_container))) - logger.info("display_container: " + str(len(display_container))) - - logger.info(str(models_)) - logger.info("stack_models() succesfully completed......................................") - - return models_ - -def create_stacknet(estimator_list, - meta_model = None, - fold = 10, - round = 4, - method = 'soft', - restack = True, - choose_better = False, #added in pycaret==2.0.0 - optimize = 'Accuracy', #added in pycaret==2.0.0 - finalize = False, - verbose = True): - - """ - - Description: - ------------ - This function creates a sequential stack net using cross validated predictions - at each layer. The final score grid contains predictions from the meta model - using Stratified Cross Validation. Base level models can be passed as - estimator_list param, the layers can be organized as a sub list within the - estimator_list object. Restacking param controls the ability to expose raw - features to meta model. - - Example: - -------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - dt = create_model('dt') - rf = create_model('rf') - ada = create_model('ada') - ridge = create_model('ridge') - knn = create_model('knn') - - stacknet = create_stacknet(estimator_list =[[dt,rf],[ada,ridge,knn]]) - - This will result in the stacking of models in multiple layers. 
The first layer
- contains dt and rf, the predictions of which are used by the models in the second
- layer to generate predictions, which are then used by the meta model to generate
- final predictions. By default, the meta model is Logistic Regression but can be
- changed with meta_model param.
-
- Parameters
- ----------
- estimator_list : nested list of objects
-
- meta_model : object, default = None
- if set to None, Logistic Regression is used as a meta model.
-
- fold: integer, default = 10
- Number of folds to be used in Kfold CV. Must be at least 2.
-
- round: integer, default = 4
- Number of decimal places the metrics in the score grid will be rounded to.
-
- method: string, default = 'soft'
- 'soft', uses predicted probabilities as an input to the meta model.
- 'hard', uses predicted class labels as an input to the meta model.
-
- restack: Boolean, default = True
- When restack is set to True, raw data and the predictions of all layers will be
- exposed to the meta model when making predictions. When set to False, only
- the predicted labels or probabilities of the last layer are passed to the meta
- model when making final predictions.
-
- choose_better: Boolean, default = False
- When set to True, the base estimator is returned when the metric doesn't
- improve by stacking. This guarantees the returned object performs at least
- as well as the base estimator created using create_model or the model
- returned by compare_models.
-
- optimize: string, default = 'Accuracy'
- Only used when choose_better is set to True. The optimize parameter is used
- to compare the stacked model with the base estimator. Values accepted in
- the optimize parameter are 'Accuracy', 'AUC', 'Recall', 'Precision', 'F1',
- 'Kappa' and 'MCC'.
-
- finalize: Boolean, default = False
- When finalize is set to True, it will fit the stacker on the entire dataset
- including the hold-out sample created during the setup() stage. It is not
- recommended to set this to True here; if you would like to fit the stacker
- on the entire dataset including the hold-out, use finalize_model().
-
- verbose: Boolean, default = True
- Score grid is not printed when verbose is set to False.
-
- Returns:
- --------
-
- score grid: A table containing the scores of the model across the kfolds.
- ----------- Scoring metrics used are Accuracy, AUC, Recall, Precision, F1,
- Kappa and MCC. Mean and standard deviation of the scores across the
- folds are also returned.
-
- container: list of all models where the last element is the meta model.
- ----------
-
- Warnings:
- ---------
- - When the method is forced to be 'soft' and estimator_list param includes
- estimators that do not support the predict_proba method such as 'svm' or
- 'ridge', predicted values for those specific estimators only are used
- instead of probabilities when building the meta_model. The same rule applies
- when the stacker is used under the predict_model() function.
-
- - If the target variable is multiclass (more than 2 classes), AUC will be
- returned as zero (0.0).
-
- - method 'soft' is not supported when the target is multiclass.
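-
- Illustrative sketch of the layer wiring (added for clarity; reuses the
- model variables from the example above):
-
- # layer 1: dt, rf -> their CV predictions feed layer 2
- # layer 2: ada, ridge, knn -> their CV predictions feed the meta model
- stacknet = create_stacknet(estimator_list=[[dt,rf],[ada,ridge,knn]],
- restack=False)
- # with restack=False only the last layer's prediction columns reach the
- # meta model; with restack=True raw features and all layers are exposed.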
-
-
- """
-
-
-
- '''
-
- ERROR HANDLING STARTS HERE
-
- '''
-
- import logging
-
- try:
- hasattr(logger, 'name')
- except:
- logger = logging.getLogger('logs')
- logger.setLevel(logging.DEBUG)
-
- # create file handler and set level to debug
- if logger.hasHandlers():
- logger.handlers.clear()
-
- ch = logging.FileHandler('logs.log')
- ch.setLevel(logging.DEBUG)
-
- # create formatter
- formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-
- # add formatter to ch
- ch.setFormatter(formatter)
-
- # add ch to logger
- logger.addHandler(ch)
-
- logger.info("Initializing create_stacknet()")
- logger.info("""create_stacknet(estimator_list={}, meta_model={}, fold={}, round={}, method={}, restack={}, choose_better={}, optimize={}, finalize={}, verbose={})""".\
- format(str(estimator_list), str(meta_model), str(fold), str(round), str(method), str(restack), str(choose_better), str(optimize), str(finalize), str(verbose)))
-
- logger.info("Checking exceptions")
-
- #exception checking
- import sys
-
- #run_time
- import datetime, time
- runtime_start = time.time()
-
- #change method param to 'hard' for multiclass
- if y.value_counts().count() > 2:
- method = 'hard'
-
- #checking estimator_list
- if type(estimator_list[0]) is not list:
- sys.exit("(Type Error): estimator_list parameter must be a list of lists.")
-
- #blocking stack_models usecase
- if len(estimator_list) == 1:
- sys.exit("(Type Error): Single layer stacking must be performed using stack_models().")
-
- #checking error for estimator_list
- for i in estimator_list:
- for j in i:
- if 'sklearn' not in str(type(j)) and 'CatBoostClassifier' not in str(type(j)):
- sys.exit("(Value Error): estimator_list parameter only accepts trained model objects.")
-
- #checking meta model
- if meta_model is not None:
- if 'sklearn' not in str(type(meta_model)) and 'CatBoostClassifier' not in str(type(meta_model)):
- sys.exit("(Value Error): meta_model parameter only accepts a trained model object.")
-
- #stacknet with multiclass
- if y.value_counts().count() > 2:
- if method == 'soft':
- sys.exit("(Type Error): method 'soft' not supported for multiclass problems.")
-
- #checking fold parameter
- if type(fold) is not int:
- sys.exit('(Type Error): Fold parameter only accepts integer value.')
-
- #checking round parameter
- if type(round) is not int:
- sys.exit('(Type Error): Round parameter only accepts integer value.')
-
- #checking method parameter
- available_method = ['soft', 'hard']
- if method not in available_method:
- sys.exit("(Value Error): Method parameter only accepts 'soft' or 'hard' as a parameter.
See Docstring for details.") - - #checking restack parameter - if type(restack) is not bool: - sys.exit('(Type Error): Restack parameter can only take argument as True or False.') - - #checking verbose parameter - if type(verbose) is not bool: - sys.exit('(Type Error): Verbose parameter can only take argument as True or False.') - - ''' - - ERROR HANDLING ENDS HERE - - ''' - - logger.info("Preloading libraries") - #pre-load libraries - import pandas as pd - import ipywidgets as ipw - from IPython.display import display, HTML, clear_output, update_display - import time, datetime - from copy import deepcopy - from sklearn.base import clone - - logger.info("Copying estimator list") - #copy estimator_list - estimator_list = deepcopy(estimator_list) - - logger.info("Defining meta model") - #copy meta_model - if meta_model is None: - from sklearn.linear_model import LogisticRegression - meta_model = LogisticRegression() - else: - meta_model = deepcopy(meta_model) - - clear_output() - - import warnings - warnings.filterwarnings('default') - warnings.warn('This function will be deprecated in future release of PyCaret 2.x.') - - if optimize == 'Accuracy': - compare_dimension = 'Accuracy' - elif optimize == 'AUC': - compare_dimension = 'AUC' - elif optimize == 'Recall': - compare_dimension = 'Recall' - elif optimize == 'Precision': - compare_dimension = 'Prec.' - elif optimize == 'F1': - compare_dimension = 'F1' - elif optimize == 'Kappa': - compare_dimension = 'Kappa' - elif optimize == 'MCC': - compare_dimension = 'MCC' - - logger.info("Preparing display monitor") - #progress bar - max_progress = len(estimator_list) + fold + 4 - progress = ipw.IntProgress(value=0, min=0, max=max_progress, step=1 , description='Processing: ') - if verbose: - if html_param: - display(progress) - - #display monitor - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ], - ['ETC' , '. . . . . . . . . . . . . . . . . 
.', 'Calculating ETC'] ], - columns=['', ' ', ' ']).set_index('') - - if verbose: - if html_param: - display(monitor, display_id = 'monitor') - - if verbose: - if html_param: - master_display = pd.DataFrame(columns=['Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa','MCC']) - display_ = display(master_display, display_id=True) - display_id = display_.display_id - - #models_ list - models_ = [] - - logger.info("Importing libraries") - #general dependencies - import numpy as np - from sklearn import metrics - from sklearn.model_selection import StratifiedKFold - from sklearn.model_selection import cross_val_predict - - progress.value += 1 - - base_level = estimator_list[0] - base_level_names = [] - - logger.info("Defining model names") - #defining base_level_names - for item in base_level: - base_level_names = np.append(base_level_names, str(item).split("(")[0]) - - base_level_fixed = [] - - for i in base_level_names: - if 'CatBoostClassifier' in i: - a = 'CatBoostClassifier' - base_level_fixed.append(a) - else: - base_level_fixed.append(i) - - base_level_fixed_2 = [] - - counter = 0 - for i in base_level_names: - s = str(i) + '_' + 'BaseLevel_' + str(counter) - base_level_fixed_2.append(s) - counter += 1 - - base_level_fixed = base_level_fixed_2 - - inter_level = estimator_list[1:] - inter_level_names = [] - - #defining inter_level names - for item in inter_level: - level_list=[] - for m in item: - if 'CatBoostClassifier' in str(m).split("(")[0]: - level_list.append('CatBoostClassifier') - else: - level_list.append(str(m).split("(")[0]) - inter_level_names.append(level_list) - - logger.info("Copying training dataset") - #defining data_X and data_y - if finalize: - data_X = X.copy() - data_y = y.copy() - else: - data_X = X_train.copy() - data_y = y_train.copy() - - #reset index - data_X.reset_index(drop=True, inplace=True) - data_y.reset_index(drop=True, inplace=True) - - - #Capturing the method of stacking required by user. 
method='soft' means 'predict_proba' else 'predict' - if method == 'soft': - predict_method = 'predict_proba' - elif method == 'hard': - predict_method = 'predict' - - base_array = np.zeros((0,0)) - base_array_df = pd.DataFrame() - base_prediction = pd.DataFrame(data_y) #change to data_y - base_prediction = base_prediction.reset_index(drop=True) - - base_counter = 0 - - base_models_ = [] - - model_fit_start = time.time() - - for model in base_level: - - logger.info('Checking base model :' + str(base_level_names[base_counter])) - base_models_.append(model.fit(data_X,data_y)) #changed to data_X and data_y - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Evaluating ' + base_level_names[base_counter] - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - progress.value += 1 - - logger.info("Generating cross val predictions") - if method == 'soft': - try: - base_array = cross_val_predict(model,data_X,data_y,cv=fold, method=predict_method) - base_array = base_array[:,1] - except: - base_array = cross_val_predict(model,data_X,data_y,cv=fold, method='predict') - else: - base_array = cross_val_predict(model,data_X,data_y,cv=fold, method='predict') - - base_array = pd.DataFrame(base_array) - base_array_df = pd.concat([base_array_df, base_array], axis=1) - base_array = np.empty((0,0)) - - base_counter += 1 - - base_array_df.fillna(value=0, inplace=True) #fill na's with zero - base_array_df.columns = base_level_fixed - - if restack: - base_array_df = pd.concat([data_X,base_array_df], axis=1) - - early_break = base_array_df.copy() - - models_.append(base_models_) - - inter_counter = 0 - - for level in inter_level: - - logger.info("Checking intermediate level: " + str(inter_counter)) - - inter_inner = [] - model_counter = 0 - inter_array_df = pd.DataFrame() - - for model in level: - - ''' - MONITOR UPDATE STARTS - ''' - - logger.info("Checking model : " + str(inter_level_names[inter_counter][model_counter])) - - monitor.iloc[1,1:] = 'Evaluating ' + inter_level_names[inter_counter][model_counter] - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - model = clone(model) - inter_inner.append(model.fit(X = base_array_df, y = data_y)) #changed to data_y - - if method == 'soft': - try: - base_array = cross_val_predict(model, X = base_array_df, y = data_y, cv=fold, method=predict_method) - base_array = base_array[:,1] - except: - base_array = cross_val_predict(model, X = base_array_df, y = data_y, cv=fold, method='predict') - - - else: - base_array = cross_val_predict(model, X = base_array_df, y = data_y, cv=fold, method='predict') - - base_array = pd.DataFrame(base_array) - - """ - defining columns - """ - - col = str(model).split("(")[0] - if 'CatBoostClassifier' in col: - col = 'CatBoostClassifier' - col = col + '_InterLevel_' + str(inter_counter) + '_' + str(model_counter) - base_array.columns = [col] - - """ - defining columns end here - """ - - inter_array_df = pd.concat([inter_array_df, base_array], axis=1) - base_array = np.empty((0,0)) - - model_counter += 1 - - base_array_df = pd.concat([base_array_df,inter_array_df], axis=1) - base_array_df.fillna(value=0, inplace=True) #fill na's with zero - - models_.append(inter_inner) - - if restack == False: - i = base_array_df.shape[1] - len(level) - base_array_df = base_array_df.iloc[:,i:] - - inter_counter += 1 - progress.value += 1 - - model = meta_model - - #redefine data_X and data_y - data_X = 
base_array_df.copy() - - meta_model_ = model.fit(data_X,data_y) - - logger.info("Defining folds") - kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param) #capturing fold requested by user - - logger.info("Declaring metric variables") - score_auc =np.empty((0,0)) - score_acc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_mcc =np.empty((0,0)) - score_training_time =np.empty((0,0)) - avgs_auc =np.empty((0,0)) - avgs_acc =np.empty((0,0)) - avgs_recall =np.empty((0,0)) - avgs_precision =np.empty((0,0)) - avgs_f1 =np.empty((0,0)) - avgs_kappa =np.empty((0,0)) - avgs_mcc =np.empty((0,0)) - avgs_training_time =np.empty((0,0)) - - fold_num = 1 - - for train_i , test_i in kf.split(data_X,data_y): - - logger.info("Initializing fold " + str(fold_num)) - - t0 = time.time() - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Fitting Meta Model Fold ' + str(fold_num) + ' of ' + str(fold) - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i] - ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i] - - time_start=time.time() - - if fix_imbalance_param: - - logger.info("Initializing SMOTE") - - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state = seed) - else: - resampler = fix_imbalance_method_param - - Xtrain,ytrain = resampler.fit_sample(Xtrain, ytrain) - logger.info("Resampling completed") - - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - try: - pred_prob = model.predict_proba(Xtest) - pred_prob = pred_prob[:,1] - except: - pass - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - - if y.value_counts().count() > 2: - recall = metrics.recall_score(ytest,pred_,average='macro') - precision = metrics.precision_score(ytest,pred_,average='weighted') - f1 = metrics.f1_score(ytest,pred_,average='weighted') - - else: - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - logger.info("Compiling metrics") - time_end=time.time() - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - training_time=time_end-time_start - score_acc = np.append(score_acc,sca) - score_auc = np.append(score_auc,sc) - score_recall = np.append(score_recall,recall) - score_precision = np.append(score_precision,precision) - score_f1 =np.append(score_f1,f1) - score_kappa =np.append(score_kappa,kappa) - score_mcc =np.append(score_mcc,mcc) - score_training_time =np.append(score_training_time,training_time) - - progress.value += 1 - - ''' - - This section handles time calculation and is created to update_display() as code loops through - the fold defined. 
- - ''' - - fold_results = pd.DataFrame({'Accuracy':[sca], 'AUC': [sc], 'Recall': [recall], - 'Prec.': [precision], 'F1': [f1], 'Kappa': [kappa],'MCC':[mcc]}).round(round) - - if verbose: - if html_param: - master_display = pd.concat([master_display, fold_results],ignore_index=True) - - fold_results = [] - - ''' - TIME CALCULATION SUB-SECTION STARTS HERE - ''' - t1 = time.time() - - tt = (t1 - t0) * (fold-fold_num) / 60 - tt = np.around(tt, 2) - - if tt < 1: - tt = str(np.around((tt * 60), 2)) - ETC = tt + ' Seconds Remaining' - - else: - tt = str (tt) - ETC = tt + ' Minutes Remaining' - - fold_num += 1 - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[2,1:] = ETC - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - ''' - TIME CALCULATION ENDS HERE - ''' - - if verbose: - if html_param: - update_display(master_display, display_id = display_id) - - - ''' - - Update_display() ends here - - ''' - - model_fit_end = time.time() - model_fit_time = np.array(model_fit_end - model_fit_start).round(2) - - logger.info("Calculating mean and std") - - mean_acc=np.mean(score_acc) - mean_auc=np.mean(score_auc) - mean_recall=np.mean(score_recall) - mean_precision=np.mean(score_precision) - mean_f1=np.mean(score_f1) - mean_kappa=np.mean(score_kappa) - mean_mcc=np.mean(score_mcc) - mean_training_time=np.sum(score_training_time) - std_acc=np.std(score_acc) - std_auc=np.std(score_auc) - std_recall=np.std(score_recall) - std_precision=np.std(score_precision) - std_f1=np.std(score_f1) - std_kappa=np.std(score_kappa) - std_mcc=np.std(score_mcc) - std_training_time=np.std(score_training_time) - - avgs_acc = np.append(avgs_acc, mean_acc) - avgs_acc = np.append(avgs_acc, std_acc) - avgs_auc = np.append(avgs_auc, mean_auc) - avgs_auc = np.append(avgs_auc, std_auc) - avgs_recall = np.append(avgs_recall, mean_recall) - avgs_recall = np.append(avgs_recall, std_recall) - avgs_precision = np.append(avgs_precision, mean_precision) - avgs_precision = np.append(avgs_precision, std_precision) - avgs_f1 = np.append(avgs_f1, mean_f1) - avgs_f1 = np.append(avgs_f1, std_f1) - avgs_kappa = np.append(avgs_kappa, mean_kappa) - avgs_kappa = np.append(avgs_kappa, std_kappa) - - avgs_mcc = np.append(avgs_mcc, mean_mcc) - avgs_mcc = np.append(avgs_mcc, std_mcc) - avgs_training_time = np.append(avgs_training_time, mean_training_time) - avgs_training_time = np.append(avgs_training_time, std_training_time) - - progress.value += 1 - - logger.info("Creating metrics dataframe") - - model_results = pd.DataFrame({'Accuracy': score_acc, 'AUC': score_auc, 'Recall' : score_recall, 'Prec.' : score_precision, - 'F1' : score_f1, 'Kappa' : score_kappa,'MCC' : score_mcc}) - model_avgs = pd.DataFrame({'Accuracy': avgs_acc, 'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' 
: avgs_precision , - 'F1' : avgs_f1, 'Kappa' : avgs_kappa,'MCC' : avgs_mcc},index=['Mean', 'SD']) - - model_results = model_results.append(model_avgs) - model_results = model_results.round(round) - - # yellow the mean - model_results=model_results.style.apply(lambda x: ['background: yellow' if (x.name == 'Mean') else '' for i in x], axis=1) - model_results = model_results.set_precision(round) - - progress.value += 1 - - #appending meta_model into models_ - models_.append(meta_model_) - - #appending method into models_ - models_.append([str(method)]) - - #appending restack param - models_.append(restack) - - #storing results in create_model_container - create_model_container.append(model_results.data) - display_container.append(model_results.data) - - #storing results in master_model_container - master_model_container.append(models_) - - ''' - When choose_better sets to True. optimize metric in scoregrid is - compared with base model created using create_model so that stack_models - functions return the model with better score only. This will ensure - model performance is atleast equivalent to what is seen in compare_models - ''' - - scorer = [] - - stack_model_results = create_model_container[-1][compare_dimension][-2:][0] - - scorer.append(stack_model_results) - - if choose_better: - - logger.info("choose_better activated") - - if verbose: - if html_param: - monitor.iloc[1,1:] = 'Compiling Final Results' - monitor.iloc[2,1:] = 'Almost Finished' - update_display(monitor, display_id = 'monitor') - - base_models_ = [] - logger.info("SubProcess create_model() called ==================================") - for i in estimator_list: - for k in i: - m = create_model(k,verbose=False, system=False) - s = create_model_container[-1][compare_dimension][-2:][0] - scorer.append(s) - base_models_.append(m) - - #re-instate display_constainer state - display_container.pop(-1) - - meta_model_clone = clone(meta_model) - mm = create_model(meta_model_clone, verbose=False, system=False) - base_models_.append(mm) - s = create_model_container[-1][compare_dimension][-2:][0] - scorer.append(s) - - #re-instate display_constainer state - display_container.pop(-1) - - logger.info("SubProcess create_model() end ==================================") - - logger.info("choose_better completed") - - #returning better model - index_scorer = scorer.index(max(scorer)) - - if index_scorer == 0: - models_ = models_ - else: - models_ = base_models_[index_scorer-1] - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - if logging_param and not finalize: - - logger.info('Creating MLFlow logs') - - import mlflow - from pathlib import Path - import os - - #Creating Logs message monitor - monitor.iloc[1,1:] = 'Creating Logs' - monitor.iloc[2,1:] = 'Almost Finished' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - with mlflow.start_run(run_name='Stacking Classifier (Multi-layer)') as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - params = meta_model.get_params() - - for i in list(params): - v = params.get(i) - if len(str(v)) > 250: - params.pop(i) - - mlflow.log_params(params) - - mlflow.log_metrics({"Accuracy": avgs_acc[0], "AUC": avgs_auc[0], "Recall": avgs_recall[0], "Precision" : avgs_precision[0], - "F1": avgs_f1[0], "Kappa": avgs_kappa[0], "MCC": avgs_mcc[0]}) - - #set tag of create_stacknet - mlflow.set_tag("Source", "create_stacknet") - - import secrets - URI = secrets.token_hex(nbytes=4) - 
mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(models_, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - # Log training time of compare_models - mlflow.log_metric("TT", model_fit_time) - - # Log the CV results as model_results.html artifact - model_results.data.to_html('Results.html', col_space=65, justify='left') - mlflow.log_artifact('Results.html') - os.remove('Results.html') - - # Generate hold-out predictions and save as html - holdout = predict_model(models_, verbose=False) - holdout_score = pull() - display_container.pop(-1) - holdout_score.to_html('Holdout.html', col_space=65, justify='left') - mlflow.log_artifact('Holdout.html') - os.remove('Holdout.html') - - if verbose: - clear_output() - if html_param: - display(model_results) - else: - print(model_results.data) - - logger.info("create_model_container: " + str(len(create_model_container))) - logger.info("master_model_container: " + str(len(master_model_container))) - logger.info("display_container: " + str(len(display_container))) - - logger.info(str(models_)) - logger.info("create_stacknet() succesfully completed......................................") - - return models_ - -def interpret_model(estimator, - plot = 'summary', - feature = None, - observation = None): - - - """ - - Description: - ------------ - This function takes a trained model object and returns an interpretation plot - based on the test / hold-out set. It only supports tree based algorithms. - - This function is implemented based on the SHAP (SHapley Additive exPlanations), - which is a unified approach to explain the output of any machine learning model. - SHAP connects game theory with local explanations. - - For more information : https://shap.readthedocs.io/en/latest/ - - Example - ------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - dt = create_model('dt') - - interpret_model(dt) - - This will return a summary interpretation plot of Decision Tree model. - - Parameters - ---------- - estimator : object, default = none - A trained tree based model object should be passed as an estimator. - - plot : string, default = 'summary' - other available options are 'correlation' and 'reason'. - - feature: string, default = None - This parameter is only needed when plot = 'correlation'. By default feature is - set to None which means the first column of the dataset will be used as a variable. - A feature parameter must be passed to change this. - - observation: integer, default = None - This parameter only comes into effect when plot is set to 'reason'. If no observation - number is provided, it will return an analysis of all observations with the option - to select the feature on x and y axes through drop down interactivity. For analysis at - the sample level, an observation parameter must be passed with the index value of the - observation in test / hold-out set. - - Returns: - -------- - - Visual Plot: Returns the visual plot. 
- ----------- Returns the interactive JS plot when plot = 'reason'. - - Warnings: - --------- - - interpret_model doesn't support multiclass problems. - - """ - - - - ''' - Error Checking starts here - - ''' - - import sys - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing interpret_model()") - logger.info("""interpret_model(estimator={}, plot={}, feature={}, observation={})""".\ - format(str(estimator), str(plot), str(feature), str(observation))) - - logger.info("Checking exceptions") - - #checking if shap available - try: - import shap - except: - logger.error("shap library not found. pip install shap to use interpret_model function.") - sys.exit("shap library not found. pip install shap to use interpret_model function.") - - #allowed models - allowed_models = ['RandomForestClassifier', - 'DecisionTreeClassifier', - 'ExtraTreesClassifier', - 'GradientBoostingClassifier', - 'XGBClassifier', - 'LGBMClassifier', - 'CatBoostClassifier'] - - model_name = str(estimator).split("(")[0] - - #Statement to find CatBoost and change name : - if model_name.find("catboost.core.CatBoostClassifier") != -1: - model_name = 'CatBoostClassifier' - - if model_name not in allowed_models: - sys.exit('(Type Error): This function only supports tree based models for binary classification.') - - #plot type - allowed_types = ['summary', 'correlation', 'reason'] - if plot not in allowed_types: - sys.exit("(Value Error): type parameter only accepts 'summary', 'correlation' or 'reason'.") - - - ''' - Error Checking Ends here - - ''' - - logger.info("Importing libraries") - #general dependencies - import numpy as np - import pandas as pd - import shap - - #storing estimator in model variable - model = estimator - - #defining type of classifier - type1 = ['RandomForestClassifier','DecisionTreeClassifier','ExtraTreesClassifier', 'LGBMClassifier'] - type2 = ['GradientBoostingClassifier', 'XGBClassifier', 'CatBoostClassifier'] - - if plot == 'summary': - - logger.info("plot type: summary") - - if model_name in type1: - - logger.info("model type detected: type 1") - logger.info("Creating TreeExplainer") - explainer = shap.TreeExplainer(model) - logger.info("Compiling shap values") - shap_values = explainer.shap_values(X_test) - shap.summary_plot(shap_values, X_test) - logger.info("Visual Rendered Successfully") - - elif model_name in type2: - - logger.info("model type detected: type 2") - logger.info("Creating TreeExplainer") - explainer = shap.TreeExplainer(model) - logger.info("Compiling shap values") - shap_values = explainer.shap_values(X_test) - shap.summary_plot(shap_values, X_test) - logger.info("Visual Rendered Successfully") - - elif plot == 'correlation': - - logger.info("plot type: correlation") - - if feature == None: - - logger.warning("No feature passed. Default value of feature used for correlation plot: " + str(X_test.columns[0])) - dependence = X_test.columns[0] - - else: - - logger.warning("feature value passed. 
Feature used for correlation plot: " + str(feature))
- dependence = feature
-
- if model_name in type1:
- logger.info("model type detected: type 1")
- logger.info("Creating TreeExplainer")
- explainer = shap.TreeExplainer(model)
- logger.info("Compiling shap values")
- shap_values = explainer.shap_values(X_test)
- shap.dependence_plot(dependence, shap_values[1], X_test)
- logger.info("Visual Rendered Successfully")
-
- elif model_name in type2:
- logger.info("model type detected: type 2")
- logger.info("Creating TreeExplainer")
- explainer = shap.TreeExplainer(model)
- logger.info("Compiling shap values")
- shap_values = explainer.shap_values(X_test)
- shap.dependence_plot(dependence, shap_values, X_test)
- logger.info("Visual Rendered Successfully")
-
- elif plot == 'reason':
-
- logger.info("plot type: reason")
-
- if model_name in type1:
- logger.info("model type detected: type 1")
-
- if observation is None:
- logger.warning("Observation set to None. Model agnostic plot will be rendered.")
- logger.info("Creating TreeExplainer")
- explainer = shap.TreeExplainer(model)
- logger.info("Compiling shap values")
- shap_values = explainer.shap_values(X_test)
- shap.initjs()
- logger.info("Visual Rendered Successfully")
- logger.info("interpret_model() successfully completed......................................")
- return shap.force_plot(explainer.expected_value[1], shap_values[1], X_test)
-
- else:
-
- if model_name == 'LGBMClassifier':
- logger.info("model type detected: LGBMClassifier")
-
- row_to_show = observation
- data_for_prediction = X_test.iloc[row_to_show]
- logger.info("Creating TreeExplainer")
- explainer = shap.TreeExplainer(model)
- logger.info("Compiling shap values")
- shap_values = explainer.shap_values(X_test)
- shap.initjs()
- logger.info("Visual Rendered Successfully")
- logger.info("interpret_model() successfully completed......................................")
- return shap.force_plot(explainer.expected_value[1], shap_values[0][row_to_show], data_for_prediction)
-
- else:
- logger.info("model type detected: Unknown")
- row_to_show = observation
- data_for_prediction = X_test.iloc[row_to_show]
- logger.info("Creating TreeExplainer")
- explainer = shap.TreeExplainer(model)
- logger.info("Compiling shap values")
- shap_values = explainer.shap_values(data_for_prediction)
- shap.initjs()
- logger.info("Visual Rendered Successfully")
- logger.info("interpret_model() successfully completed......................................")
- return shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)
-
-
- elif model_name in type2:
- logger.info("model type detected: type 2")
-
- if observation is None:
- logger.warning("Observation set to None. Model agnostic plot will be rendered.")
- logger.info("Creating TreeExplainer")
- explainer = shap.TreeExplainer(model)
- logger.info("Compiling shap values")
- shap_values = explainer.shap_values(X_test)
- shap.initjs()
- logger.info("Visual Rendered Successfully")
- logger.info("interpret_model() successfully completed......................................")
- return shap.force_plot(explainer.expected_value, shap_values, X_test)
-
- else:
-
- row_to_show = observation
- data_for_prediction = X_test.iloc[row_to_show]
- logger.info("Creating TreeExplainer")
- explainer = shap.TreeExplainer(model)
- logger.info("Compiling shap values")
- shap_values = explainer.shap_values(X_test)
- shap.initjs()
- logger.info("Visual Rendered Successfully")
- logger.info("interpret_model() successfully completed......................................")
- return shap.force_plot(explainer.expected_value, shap_values[row_to_show,:], X_test.iloc[row_to_show,:])
-
- logger.info("interpret_model() successfully completed......................................")
-
-def calibrate_model(estimator,
- method = 'sigmoid',
- fold=10,
- round=4,
- verbose=True):
-
- """
-
- Description:
- ------------
- This function takes a trained estimator and performs probability
- calibration with sigmoid or isotonic regression. The output prints a score
- grid that shows Accuracy, AUC, Recall, Precision, F1, Kappa and MCC by fold
- (default = 10 folds). The output of the original estimator and the calibrated
- estimator (created using this function) might not differ much. To see the
- calibration differences, use the 'calibration' plot in plot_model to compare
- the estimator before and after calibration.
-
- This function returns a trained model object.
-
- Example
- -------
- from pycaret.datasets import get_data
- juice = get_data('juice')
- experiment_name = setup(data = juice, target = 'Purchase')
- dt_boosted = create_model('dt', ensemble = True, method = 'Boosting')
-
- calibrated_dt = calibrate_model(dt_boosted)
-
- This will return a Calibrated Boosted Decision Tree Model.
-
- Parameters
- ----------
- estimator : object
-
- method : string, default = 'sigmoid'
- The method to use for calibration. Can be 'sigmoid' which corresponds to Platt's
- method or 'isotonic' which is a non-parametric approach. It is not advised to use
- isotonic calibration with too few calibration samples.
-
- fold: integer, default = 10
- Number of folds to be used in Kfold CV. Must be at least 2.
-
- round: integer, default = 4
- Number of decimal places the metrics in the score grid will be rounded to.
-
- verbose: Boolean, default = True
- Score grid is not printed when verbose is set to False.
-
- Returns:
- --------
-
- score grid: A table containing the scores of the model across the kfolds.
- ----------- Scoring metrics used are Accuracy, AUC, Recall, Precision, F1,
- Kappa and MCC. Mean and standard deviation of the scores across
- the folds are also returned.
-
- model: trained and calibrated model object.
- -----------
-
- Warnings:
- ---------
- - Avoid isotonic calibration with too few calibration samples (<1000) since it
- tends to overfit.
-
- - calibration plot not available for multiclass problems.
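-
- Illustrative sketch of inspecting the calibration effect (added for
- clarity; assumes the binary 'juice' example above):
-
- dt = create_model('dt')
- calibrated_dt = calibrate_model(dt, method = 'isotonic')
- plot_model(dt, plot = 'calibration') #reliability curve before
- plot_model(calibrated_dt, plot = 'calibration') #and after calibration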
- - - """ - - - ''' - - ERROR HANDLING STARTS HERE - - ''' - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing calibrate_model()") - logger.info("""calibrate_model(estimator={}, method={}, fold={}, round={}, verbose={})""".\ - format(str(estimator), str(method), str(fold), str(round), str(verbose))) - - logger.info("Checking exceptions") - - #exception checking - import sys - - #run_time - import datetime, time - runtime_start = time.time() - - #Statement to find CatBoost and change name - model_name = str(estimator).split("(")[0] - if model_name.find("catboost.core.CatBoostClassifier") != -1: - model_name = 'CatBoostClassifier' - - #catboost not allowed - not_allowed = ['CatBoostClassifier'] - if model_name in not_allowed: - sys.exit('(Type Error): calibrate_model doesnt support CatBoost Classifier. Try different estimator.') - - #checking fold parameter - if type(fold) is not int: - sys.exit('(Type Error): Fold parameter only accepts integer value.') - - #checking round parameter - if type(round) is not int: - sys.exit('(Type Error): Round parameter only accepts integer value.') - - #checking verbose parameter - if type(verbose) is not bool: - sys.exit('(Type Error): Verbose parameter can only take argument as True or False.') - - - ''' - - ERROR HANDLING ENDS HERE - - ''' - - logger.info("Preloading libraries") - - #pre-load libraries - import pandas as pd - import ipywidgets as ipw - from IPython.display import display, HTML, clear_output, update_display - - logger.info("Preparing display monitor") - #progress bar - progress = ipw.IntProgress(value=0, min=0, max=fold+4, step=1 , description='Processing: ') - master_display = pd.DataFrame(columns=['Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa','MCC']) - if verbose: - if html_param: - display(progress) - - #display monitor - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies' ], - ['ETC' , '. . . . . . . . . . . . . . . . . 
.', 'Calculating ETC'] ], - columns=['', ' ', ' ']).set_index('') - - if verbose: - if html_param: - display(monitor, display_id = 'monitor') - - if verbose: - if html_param: - display_ = display(master_display, display_id=True) - display_id = display_.display_id - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - logger.info("Copying training dataset") - #Storing X_train and y_train in data_X and data_y parameter - data_X = X_train.copy() - data_y = y_train.copy() - - #reset index - data_X.reset_index(drop=True, inplace=True) - data_y.reset_index(drop=True, inplace=True) - - logger.info("Importing libraries") - #general dependencies - import numpy as np - from sklearn import metrics - from sklearn.model_selection import StratifiedKFold - from sklearn.calibration import CalibratedClassifierCV - - progress.value += 1 - - logger.info("Getting model name") - - def get_model_name(e): - return str(e).split("(")[0] - - if len(estimator.classes_) > 2: - - if hasattr(estimator, 'voting'): - mn = get_model_name(estimator) - else: - mn = get_model_name(estimator.estimator) - - else: - if hasattr(estimator, 'voting'): - mn = 'VotingClassifier' - else: - mn = get_model_name(estimator) - - if 'catboost' in mn: - mn = 'CatBoostClassifier' - - model_dict_logging = {'ExtraTreesClassifier' : 'Extra Trees Classifier', - 'GradientBoostingClassifier' : 'Gradient Boosting Classifier', - 'RandomForestClassifier' : 'Random Forest Classifier', - 'LGBMClassifier' : 'Light Gradient Boosting Machine', - 'XGBClassifier' : 'Extreme Gradient Boosting', - 'AdaBoostClassifier' : 'Ada Boost Classifier', - 'DecisionTreeClassifier' : 'Decision Tree Classifier', - 'RidgeClassifier' : 'Ridge Classifier', - 'LogisticRegression' : 'Logistic Regression', - 'KNeighborsClassifier' : 'K Neighbors Classifier', - 'GaussianNB' : 'Naive Bayes', - 'SGDClassifier' : 'SVM - Linear Kernel', - 'SVC' : 'SVM - Radial Kernel', - 'GaussianProcessClassifier' : 'Gaussian Process Classifier', - 'MLPClassifier' : 'MLP Classifier', - 'QuadraticDiscriminantAnalysis' : 'Quadratic Discriminant Analysis', - 'LinearDiscriminantAnalysis' : 'Linear Discriminant Analysis', - 'CatBoostClassifier' : 'CatBoost Classifier', - 'BaggingClassifier' : 'Bagging Classifier', - 'VotingClassifier' : 'Voting Classifier'} - - base_estimator_full_name = model_dict_logging.get(mn) - - logger.info("Base model : " + str(base_estimator_full_name)) - - #cross validation setup starts here - logger.info("Defining folds") - kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle_param) - - logger.info("Declaring metric variables") - score_auc =np.empty((0,0)) - score_acc =np.empty((0,0)) - score_recall =np.empty((0,0)) - score_precision =np.empty((0,0)) - score_f1 =np.empty((0,0)) - score_kappa =np.empty((0,0)) - score_mcc =np.empty((0,0)) - score_training_time =np.empty((0,0)) - avgs_auc =np.empty((0,0)) - avgs_acc =np.empty((0,0)) - avgs_recall =np.empty((0,0)) - avgs_precision =np.empty((0,0)) - avgs_f1 =np.empty((0,0)) - avgs_kappa =np.empty((0,0)) - avgs_mcc =np.empty((0,0)) - avgs_training_time =np.empty((0,0)) - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Selecting Estimator' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - #calibrating estimator - - logger.info("Importing untrained CalibratedClassifierCV") - model = CalibratedClassifierCV(base_estimator=estimator, method=method, cv=fold) - full_name = str(model).split("(")[0] - - progress.value 
+= 1 - - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Initializing CV' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - - fold_num = 1 - - for train_i , test_i in kf.split(data_X,data_y): - - logger.info("Initializing Fold " + str(fold_num)) - - t0 = time.time() - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[1,1:] = 'Fitting Fold ' + str(fold_num) + ' of ' + str(fold) - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - - Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i] - ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i] - time_start=time.time() - - if fix_imbalance_param: - - logger.info("Initializing SMOTE") - - if fix_imbalance_method_param is None: - from imblearn.over_sampling import SMOTE - resampler = SMOTE(random_state = seed) - else: - resampler = fix_imbalance_method_param - - Xtrain,ytrain = resampler.fit_sample(Xtrain, ytrain) - logger.info("Resampling completed") - - if hasattr(model, 'predict_proba'): - - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - pred_prob = model.predict_proba(Xtest) - pred_prob = pred_prob[:,1] - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - else: - logger.info("Fitting Model") - model.fit(Xtrain,ytrain) - logger.info("Evaluating Metrics") - pred_prob = 0.00 - pred_ = model.predict(Xtest) - sca = metrics.accuracy_score(ytest,pred_) - - if y.value_counts().count() > 2: - sc = 0 - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - logger.info("Compiling Metrics") - time_end=time.time() - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - training_time=time_end-time_start - score_acc = np.append(score_acc,sca) - score_auc = np.append(score_auc,sc) - score_recall = np.append(score_recall,recall) - score_precision = np.append(score_precision,precision) - score_f1 =np.append(score_f1,f1) - score_kappa =np.append(score_kappa,kappa) - score_mcc =np.append(score_mcc,mcc) - score_training_time =np.append(score_training_time,training_time) - - progress.value += 1 - - - ''' - - This section handles time calculation and is created to update_display() as code loops through - the fold defined. 
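-        
-        The ETC shown in the monitor is estimated as (seconds taken by the fold
-        just completed) x (number of folds remaining); estimates under one minute
-        are converted back to seconds for display.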
- - ''' - - fold_results = pd.DataFrame({'Accuracy':[sca], 'AUC': [sc], 'Recall': [recall], - 'Prec.': [precision], 'F1': [f1], 'Kappa': [kappa],'MCC':[mcc]}).round(round) - master_display = pd.concat([master_display, fold_results],ignore_index=True) - fold_results = [] - - ''' - TIME CALCULATION SUB-SECTION STARTS HERE - ''' - t1 = time.time() - - tt = (t1 - t0) * (fold-fold_num) / 60 - tt = np.around(tt, 2) - - if tt < 1: - tt = str(np.around((tt * 60), 2)) - ETC = tt + ' Seconds Remaining' - - else: - tt = str (tt) - ETC = tt + ' Minutes Remaining' - - ''' - MONITOR UPDATE STARTS - ''' - - monitor.iloc[2,1:] = ETC - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - ''' - MONITOR UPDATE ENDS - ''' - - fold_num += 1 - - ''' - TIME CALCULATION ENDS HERE - ''' - - if verbose: - if html_param: - update_display(master_display, display_id = display_id) - - - ''' - - Update_display() ends here - - ''' - - logger.info("Calculating mean and std") - mean_acc=np.mean(score_acc) - mean_auc=np.mean(score_auc) - mean_recall=np.mean(score_recall) - mean_precision=np.mean(score_precision) - mean_f1=np.mean(score_f1) - mean_kappa=np.mean(score_kappa) - mean_mcc=np.mean(score_mcc) - mean_training_time=np.sum(score_training_time) - std_acc=np.std(score_acc) - std_auc=np.std(score_auc) - std_recall=np.std(score_recall) - std_precision=np.std(score_precision) - std_f1=np.std(score_f1) - std_kappa=np.std(score_kappa) - std_mcc=np.std(score_mcc) - std_training_time=np.std(score_training_time) - - avgs_acc = np.append(avgs_acc, mean_acc) - avgs_acc = np.append(avgs_acc, std_acc) - avgs_auc = np.append(avgs_auc, mean_auc) - avgs_auc = np.append(avgs_auc, std_auc) - avgs_recall = np.append(avgs_recall, mean_recall) - avgs_recall = np.append(avgs_recall, std_recall) - avgs_precision = np.append(avgs_precision, mean_precision) - avgs_precision = np.append(avgs_precision, std_precision) - avgs_f1 = np.append(avgs_f1, mean_f1) - avgs_f1 = np.append(avgs_f1, std_f1) - avgs_kappa = np.append(avgs_kappa, mean_kappa) - avgs_kappa = np.append(avgs_kappa, std_kappa) - avgs_mcc = np.append(avgs_mcc, mean_mcc) - avgs_mcc = np.append(avgs_mcc, std_mcc) - avgs_training_time = np.append(avgs_training_time, mean_training_time) - avgs_training_time = np.append(avgs_training_time, std_training_time) - - progress.value += 1 - - logger.info("Creating metrics dataframe") - model_results = pd.DataFrame({'Accuracy': score_acc, 'AUC': score_auc, 'Recall' : score_recall, 'Prec.' : score_precision , - 'F1' : score_f1, 'Kappa' : score_kappa,'MCC' : score_mcc}) - model_avgs = pd.DataFrame({'Accuracy': avgs_acc, 'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' 
: avgs_precision , - 'F1' : avgs_f1, 'Kappa' : avgs_kappa,'MCC' : avgs_mcc},index=['Mean', 'SD']) - - model_results = model_results.append(model_avgs) - model_results = model_results.round(round) - - # yellow the mean - model_results=model_results.style.apply(lambda x: ['background: yellow' if (x.name == 'Mean') else '' for i in x], axis=1) - model_results=model_results.set_precision(round) - - #refitting the model on complete X_train, y_train - monitor.iloc[1,1:] = 'Compiling Final Model' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - model_fit_start = time.time() - logger.info("Finalizing model") - model.fit(data_X, data_y) - model_fit_end = time.time() - - model_fit_time = np.array(model_fit_end - model_fit_start).round(2) - - progress.value += 1 - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - #storing results in create_model_container - logger.info("Uploading results into container") - create_model_container.append(model_results.data) - display_container.append(model_results.data) - - #storing results in master_model_container - logger.info("Uploading model into container") - master_model_container.append(model) - - #mlflow logging - if logging_param: - - logger.info("Creating MLFlow logs") - - #Creating Logs message monitor - monitor.iloc[1,1:] = 'Creating Logs' - monitor.iloc[2,1:] = 'Almost Finished' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - #import mlflow - import mlflow - import mlflow.sklearn - from pathlib import Path - import os - - mlflow.set_experiment(exp_name_log) - - with mlflow.start_run(run_name=base_estimator_full_name) as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - # Log model parameters - params = model.get_params() - - for i in list(params): - v = params.get(i) - if len(str(v)) > 250: - params.pop(i) - - mlflow.log_params(params) - - # Log metrics - mlflow.log_metrics({"Accuracy": avgs_acc[0], "AUC": avgs_auc[0], "Recall": avgs_recall[0], "Precision" : avgs_precision[0], - "F1": avgs_f1[0], "Kappa": avgs_kappa[0], "MCC": avgs_mcc[0]}) - - - #set tag of compare_models - mlflow.set_tag("Source", "calibrate_model") - - import secrets - URI = secrets.token_hex(nbytes=4) - mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - # Log training time in seconds - mlflow.log_metric("TT", model_fit_time) - - # Log the CV results as model_results.html artifact - model_results.data.to_html('Results.html', col_space=65, justify='left') - mlflow.log_artifact('Results.html') - os.remove('Results.html') - - # Generate hold-out predictions and save as html - holdout = predict_model(model, verbose=False) - holdout_score = pull() - display_container.pop(-1) - holdout_score.to_html('Holdout.html', col_space=65, justify='left') - mlflow.log_artifact('Holdout.html') - os.remove('Holdout.html') - - # Log AUC and Confusion Matrix plot - if log_plots_param: - - logger.info("SubProcess plot_model() called ==================================") - - try: - plot_model(model, plot = 'auc', verbose=False, save=True, system=False) - mlflow.log_artifact('AUC.png') - os.remove("AUC.png") - except: - pass - - try: - plot_model(model, plot = 'confusion_matrix', verbose=False, save=True, system=False) - mlflow.log_artifact('Confusion Matrix.png') - os.remove("Confusion Matrix.png") - except: - pass - - try: - plot_model(model, plot = 'feature', 
verbose=False, save=True, system=False) - mlflow.log_artifact('Feature Importance.png') - os.remove("Feature Importance.png") - except: - pass - - logger.info("SubProcess plot_model() end ==================================") - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(model, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - if verbose: - clear_output() - if html_param: - display(model_results) - else: - print(model_results.data) - - logger.info("create_model_container: " + str(len(create_model_container))) - logger.info("master_model_container: " + str(len(master_model_container))) - logger.info("display_container: " + str(len(display_container))) - - logger.info(str(model)) - logger.info("calibrate_model() succesfully completed......................................") - - return model - -def evaluate_model(estimator): - - """ - - Description: - ------------ - This function displays a user interface for all of the available plots for - a given estimator. It internally uses the plot_model() function. - - Example: - -------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - lr = create_model('lr') - - evaluate_model(lr) - - This will display the User Interface for all of the plots for a given - estimator. - - Parameters - ---------- - estimator : object, default = none - A trained model object should be passed as an estimator. - - Returns: - -------- - - User Interface: Displays the user interface for plotting. - -------------- - - - """ - - - from ipywidgets import widgets - from ipywidgets.widgets import interact, fixed, interact_manual - - a = widgets.ToggleButtons( - options=[('Hyperparameters', 'parameter'), - ('AUC', 'auc'), - ('Confusion Matrix', 'confusion_matrix'), - ('Threshold', 'threshold'), - ('Precision Recall', 'pr'), - ('Error', 'error'), - ('Class Report', 'class_report'), - ('Feature Selection', 'rfe'), - ('Learning Curve', 'learning'), - ('Manifold Learning', 'manifold'), - ('Calibration Curve', 'calibration'), - ('Validation Curve', 'vc'), - ('Dimensions', 'dimension'), - ('Feature Importance', 'feature'), - ('Decision Boundary', 'boundary') - ], - - description='Plot Type:', - - disabled=False, - - button_style='', # 'success', 'info', 'warning', 'danger' or '' - - icons=[''] - ) - - - d = interact(plot_model, estimator = fixed(estimator), plot = a, save = fixed(False), verbose = fixed(True), system = fixed(True)) - -def finalize_model(estimator): - - """ - - Description: - ------------ - This function fits the estimator onto the complete dataset passed during the - setup() stage. The purpose of this function is to prepare for final model - deployment after experimentation. - - Example: - -------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - lr = create_model('lr') - - final_lr = finalize_model(lr) - - This will return the final model object fitted to complete dataset. - - Parameters - ---------- - estimator : object, default = none - A trained model object should be passed as an estimator. 
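-    
-    Note: when a list of models created by stack_models() or create_stacknet()
-    is passed, the entire stack is re-fitted on the complete dataset internally
-    (by calling stack_models() / create_stacknet() with finalize=True) instead
-    of a single estimator.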
- - Returns: - -------- - - Model: Trained model object fitted on complete dataset. - ------ - - Warnings: - --------- - - If the model returned by finalize_model(), is used on predict_model() without - passing a new unseen dataset, then the information grid printed is misleading - as the model is trained on the complete dataset including test / hold-out sample. - Once finalize_model() is used, the model is considered ready for deployment and - should be used on new unseens dataset only. - - - """ - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing finalize_model()") - logger.info("""finalize_model(estimator={})""".\ - format(str(estimator))) - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - #run_time - import datetime, time - runtime_start = time.time() - - logger.info("Importing libraries") - #import depedencies - from IPython.display import clear_output, update_display - from sklearn.base import clone - from copy import deepcopy - import numpy as np - - logger.info("Getting model name") - - #determine runname for logging - def get_model_name(e): - return str(e).split("(")[0] - - model_dict_logging = {'ExtraTreesClassifier' : 'Extra Trees Classifier', - 'GradientBoostingClassifier' : 'Gradient Boosting Classifier', - 'RandomForestClassifier' : 'Random Forest Classifier', - 'LGBMClassifier' : 'Light Gradient Boosting Machine', - 'XGBClassifier' : 'Extreme Gradient Boosting', - 'AdaBoostClassifier' : 'Ada Boost Classifier', - 'DecisionTreeClassifier' : 'Decision Tree Classifier', - 'RidgeClassifier' : 'Ridge Classifier', - 'LogisticRegression' : 'Logistic Regression', - 'KNeighborsClassifier' : 'K Neighbors Classifier', - 'GaussianNB' : 'Naive Bayes', - 'SGDClassifier' : 'SVM - Linear Kernel', - 'SVC' : 'SVM - Radial Kernel', - 'GaussianProcessClassifier' : 'Gaussian Process Classifier', - 'MLPClassifier' : 'MLP Classifier', - 'QuadraticDiscriminantAnalysis' : 'Quadratic Discriminant Analysis', - 'LinearDiscriminantAnalysis' : 'Linear Discriminant Analysis', - 'CatBoostClassifier' : 'CatBoost Classifier', - 'BaggingClassifier' : 'Bagging Classifier', - 'VotingClassifier' : 'Voting Classifier'} - - if type(estimator) is not list: - - if len(estimator.classes_) > 2: - - if hasattr(estimator, 'voting'): - mn = get_model_name(estimator) - else: - mn = get_model_name(estimator.estimator) - - else: - - if hasattr(estimator, 'voting'): - mn = 'VotingClassifier' - else: - mn = get_model_name(estimator) - - if 'BaggingClassifier' in mn: - mn = get_model_name(estimator.base_estimator_) - - if 'CalibratedClassifierCV' in mn: - mn = get_model_name(estimator.base_estimator) - - if 'catboost' in mn: - mn = 'CatBoostClassifier' - - if type(estimator) is list: - if type(estimator[0]) is not list: - full_name = 'Stacking Classifier' - else: - full_name = 'Stacking Classifier (Multi-layer)' - else: - full_name = model_dict_logging.get(mn) - - if type(estimator) is list: - - if type(estimator[0]) is not list: - - logger.info("Finalizing Stacking Classifier") - - """ - Single Layer Stacker - """ - - 
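-            # A stacker produced by stack_models() is stored as a flat list:
-            # [base_model_1, ..., base_model_n, meta_model, method, restack],
-            # so the three trailing control elements are popped off in reverse
-            # order below before the stack is re-built on the complete dataset.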
stacker_final = deepcopy(estimator) - stack_restack = stacker_final.pop() - stack_method_final = stacker_final.pop() - stack_meta_final = stacker_final.pop() - - logger.info("SubProcess stack_models() called ==================================") - model_final = stack_models(estimator_list = stacker_final, - meta_model = stack_meta_final, - method = stack_method_final, - restack = stack_restack, - finalize=True, - verbose=False) - logger.info("SubProcess stack_models() end ==================================") - - else: - - """ - multiple layer stacknet - """ - - logger.info("Finalizing Multi-layer Stacking Classifier") - - stacker_final = deepcopy(estimator) - stack_restack = stacker_final.pop() - stack_method_final = stacker_final.pop()[0] - stack_meta_final = stacker_final.pop() - - logger.info("SubProcess create_stacknet() called ==================================") - model_final = create_stacknet(estimator_list = stacker_final, - meta_model = stack_meta_final, - method = stack_method_final, - restack = stack_restack, - finalize = True, - verbose = False) - logger.info("SubProcess create_stacknet() called ==================================") - - pull_results = pull() - - else: - - logger.info("Finalizing " + str(full_name)) - model_final = clone(estimator) - clear_output() - model_final.fit(X,y) - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - #mlflow logging - if logging_param: - - logger.info("Creating MLFlow logs") - - #import mlflow - import mlflow - from pathlib import Path - import mlflow.sklearn - import os - - mlflow.set_experiment(exp_name_log) - - with mlflow.start_run(run_name=full_name) as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - # Log model parameters - try: - params = model_final.get_params() - - for i in list(params): - v = params.get(i) - if len(str(v)) > 250: - params.pop(i) - - mlflow.log_params(params) - - except: - pass - - # get metrics of non-finalized model and log it - - try: - logger.info("SubProcess create_model() called ==================================") - c = create_model(estimator, verbose=False, system=False) - logger.info("SubProcess create_model() end ==================================") - cr = pull() - log_accuracy = cr.loc['Mean']['Accuracy'] - log_auc = cr.loc['Mean']['AUC'] - log_recall = cr.loc['Mean']['Recall'] - log_precision = cr.loc['Mean']['Prec.'] - log_f1 = cr.loc['Mean']['F1'] - log_kappa = cr.loc['Mean']['Kappa'] - log_mcc = cr.loc['Mean']['MCC'] - - mlflow.log_metric("Accuracy", log_accuracy) - mlflow.log_metric("AUC", log_auc) - mlflow.log_metric("Recall", log_recall) - mlflow.log_metric("Precision", log_precision) - mlflow.log_metric("F1", log_f1) - mlflow.log_metric("Kappa", log_kappa) - mlflow.log_metric("MCC", log_mcc) - - except: - cr = pull_results - log_accuracy = cr.loc['Mean']['Accuracy'] - log_auc = cr.loc['Mean']['AUC'] - log_recall = cr.loc['Mean']['Recall'] - log_precision = cr.loc['Mean']['Prec.'] - log_f1 = cr.loc['Mean']['F1'] - log_kappa = cr.loc['Mean']['Kappa'] - log_mcc = cr.loc['Mean']['MCC'] - - mlflow.log_metric("Accuracy", log_accuracy) - mlflow.log_metric("AUC", log_auc) - mlflow.log_metric("Recall", log_recall) - mlflow.log_metric("Precision", log_precision) - mlflow.log_metric("F1", log_f1) - mlflow.log_metric("Kappa", log_kappa) - mlflow.log_metric("MCC", log_mcc) - - #set tag of compare_models - mlflow.set_tag("Source", "finalize_model") - - #create MRI (model registration id) - mlflow.set_tag("Final", True) 
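-            # Tag the run for traceability: URI is a random 8-character hex id,
-            # USI is the unique session id created at setup(), along with the
-            # wall-clock runtime and the mlflow run id.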
- - import secrets - URI = secrets.token_hex(nbytes=4) - mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - # Log training time in seconds - mlflow.log_metric("TT", runtime) - - # Log AUC and Confusion Matrix plot - if log_plots_param: - - logger.info("SubProcess plot_model() called ==================================") - - try: - plot_model(model_final, plot = 'auc', verbose=False, save=True, system=False) - mlflow.log_artifact('AUC.png') - os.remove("AUC.png") - except: - pass - - try: - plot_model(model_final, plot = 'confusion_matrix', verbose=False, save=True, system=False) - mlflow.log_artifact('Confusion Matrix.png') - os.remove("Confusion Matrix.png") - except: - pass - - try: - plot_model(model_final, plot = 'feature', verbose=False, save=True, system=False) - mlflow.log_artifact('Feature Importance.png') - os.remove("Feature Importance.png") - except: - pass - - logger.info("SubProcess plot_model() end ==================================") - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(model_final, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - logger.info("create_model_container: " + str(len(create_model_container))) - logger.info("master_model_container: " + str(len(master_model_container))) - logger.info("display_container: " + str(len(display_container))) - - logger.info(str(model_final)) - logger.info("finalize_model() succesfully completed......................................") - - return model_final - -def save_model(model, model_name, verbose=True): - - """ - - Description: - ------------ - This function saves the transformation pipeline and trained model object - into the current active directory as a pickle file for later use. - - Example: - -------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - lr = create_model('lr') - - save_model(lr, 'lr_model_23122019') - - This will save the transformation pipeline and model as a binary pickle - file in the current active directory. - - Parameters - ---------- - model : object, default = none - A trained model object should be passed as an estimator. - - model_name : string, default = none - Name of pickle file to be passed as a string. - - verbose: Boolean, default = True - Success message is not printed when verbose is set to False. 
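-    
-    Note: the pickle written to disk is a two-element list holding the
-    preprocessing pipeline (prep_pipe) and the trained model; this is the
-    layout load_model() and predict_model() expect to read back.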
-    
-    Returns:
-    --------    
-    Success Message
-    
-    
-    """
-    
-    import logging
-
-    try:
-        hasattr(logger, 'name')
-    except:
-        logger = logging.getLogger('logs')
-        logger.setLevel(logging.DEBUG)
-        
-        # create file handler and set level to debug
-        if logger.hasHandlers():
-            logger.handlers.clear()
-        
-        ch = logging.FileHandler('logs.log')
-        ch.setLevel(logging.DEBUG)
-        
-        # create formatter
-        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-        
-        # add formatter to ch
-        ch.setFormatter(formatter)
-        
-        # add ch to logger
-        logger.addHandler(ch)
-
-    logger.info("Initializing save_model()")
-    logger.info("""save_model(model={}, model_name={}, verbose={})""".\
-        format(str(model), str(model_name), str(verbose)))
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    logger.info("Appending prep pipeline")
-    model_ = []
-    model_.append(prep_pipe)
-    model_.append(model)
-
-    import joblib
-    model_name = model_name + '.pkl'
-    joblib.dump(model_, model_name)
-    if verbose:
-        print('Transformation Pipeline and Model Successfully Saved')
-
-    logger.info(str(model_name) + ' saved in current working directory')
-    logger.info(str(model_))
-    logger.info("save_model() successfully completed......................................")
-
-def load_model(model_name,
-               platform = None,
-               authentication = None,
-               verbose=True):
-
-    """
-
-    Description:
-    ------------
-    This function loads a previously saved transformation pipeline and model
-    from the current active directory into the current python environment.
-    The object to load must be a pickle file.
-
-    Example:
-    --------
-    saved_lr = load_model('lr_model_23122019')
-
-    This will load the previously saved model into the saved_lr variable. The file
-    must be in the current directory.
-
-    Parameters
-    ----------
-    model_name : string, default = none
-        Name of pickle file to be passed as a string.
-
-    platform: string, default = None
-        Name of platform, if loading model from cloud. Current available options are:
-        'aws'.
-
-    authentication : dict
-        dictionary of applicable authentication tokens.
-
-        When platform = 'aws':
-        {'bucket' : 'Name of Bucket on S3'}
-
-    verbose: Boolean, default = True
-        Success message is not printed when verbose is set to False.
-
-    Returns:
-    --------
-    Success Message
-
-
-    """
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    #exception checking
-    import sys
-
-    if platform is not None:
-        if authentication is None:
-            sys.exit("(Value Error): Authentication is missing.")
-
-    #cloud provider
-    if platform == 'aws':
-
-        import boto3
-        bucketname = authentication.get('bucket')
-        filename = str(model_name) + '.pkl'
-        s3 = boto3.resource('s3')
-        s3.Bucket(bucketname).download_file(filename, filename)
-        filename = str(model_name)
-        model = load_model(filename, verbose=False)
-
-        if verbose:
-            print('Transformation Pipeline and Model Successfully Loaded')
-
-        return model
-
-    import joblib
-    model_name = model_name + '.pkl'
-    if verbose:
-        print('Transformation Pipeline and Model Successfully Loaded')
-    return joblib.load(model_name)
-
-def predict_model(estimator,
-                  data=None,
-                  probability_threshold=None,
-                  platform=None,
-                  authentication=None,
-                  verbose=True): #added in pycaret==2.0.0
-
-    """
-
-    Description:
-    ------------
-    This function is used to predict new data using a trained estimator. It accepts
-    an estimator created using one of the functions in pycaret that returns a trained
-    model object or a list of trained model objects created using stack_models() or
-    create_stacknet().
New unseen data can be passed to data param as pandas Dataframe. - If data is not passed, the test / hold-out set separated at the time of setup() is - used to generate predictions. - - Example: - -------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - lr = create_model('lr') - - lr_predictions_holdout = predict_model(lr) - - Parameters - ---------- - estimator : object or list of objects / string, default = None - When estimator is passed as string, load_model() is called internally to load the - pickle file from active directory or cloud platform when platform param is passed. - - data : {array-like, sparse matrix}, shape (n_samples, n_features) where n_samples - is the number of samples and n_features is the number of features. All features - used during training must be present in the new dataset. - - probability_threshold : float, default = None - threshold used to convert probability values into binary outcome. By default the - probability threshold for all binary classifiers is 0.5 (50%). This can be changed - using probability_threshold param. - - platform: string, default = None - Name of platform, if loading model from cloud. Current available options are: - 'aws'. - - authentication : dict - dictionary of applicable authentication tokens. - - When platform = 'aws': - {'bucket' : 'Name of Bucket on S3'} - - system: Boolean, default = True - Must remain True all times. Only to be changed by internal functions. - - verbose: Boolean, default = True - Holdout score grid is not printed when verbose is set to False. - - Returns: - -------- - - info grid: Information grid is printed when data is None. - ---------- - - Warnings: - --------- - - if the estimator passed is created using finalize_model() then the metrics - printed in the information grid maybe misleading as the model is trained on - the complete dataset including the test / hold-out set. Once finalize_model() - is used, the model is considered ready for deployment and should be used on new - unseen datasets only. - - - """ - - #testing - #no active test - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - #general dependencies - import sys - import numpy as np - import pandas as pd - import re - from sklearn import metrics - from copy import deepcopy - from IPython.display import clear_output, update_display - - """ - exception checking starts here - """ - - model_name = str(estimator).split("(")[0] - if probability_threshold is not None: - if 'OneVsRestClassifier' in model_name: - sys.exit("(Type Error) probability_threshold parameter cannot be used when target is multi-class. ") - - #probability_threshold allowed types - if probability_threshold is not None: - allowed_types = [int,float] - if type(probability_threshold) not in allowed_types: - sys.exit("(Type Error) probability_threshold parameter only accepts value between 0 to 1. ") - - #probability_threshold allowed types - if probability_threshold is not None: - if probability_threshold > 1: - sys.exit("(Type Error) probability_threshold parameter only accepts value between 0 to 1. ") - - #probability_threshold allowed types - if probability_threshold is not None: - if probability_threshold < 0: - sys.exit("(Type Error) probability_threshold parameter only accepts value between 0 to 1. 
") - - """ - exception checking ends here - """ - - estimator = deepcopy(estimator) #lookout for an alternate of deepcopy() - - try: - clear_output() - except: - pass - - if type(estimator) is str: - if platform == 'aws': - estimator_ = load_model(str(estimator), platform='aws', - authentication={'bucket': authentication.get('bucket')}, - verbose=False) - - else: - estimator_ = load_model(str(estimator), verbose=False) - - else: - - estimator_ = estimator - - if type(estimator_) is list: - - if 'sklearn.pipeline.Pipeline' in str(type(estimator_[0])): - - prep_pipe_transformer = estimator_.pop(0) - model = estimator_[0] - estimator = estimator_[0] - - else: - - try: - - prep_pipe_transformer = prep_pipe - model = estimator - estimator = estimator - - except: - - sys.exit("(Type Error): Transformation Pipe Missing. ") - - else: - - try: - - prep_pipe_transformer = prep_pipe - model = estimator - estimator = estimator - - except: - - sys.exit("(Type Error): Transformation Pipe Missing. ") - - #dataset - if data is None: - - Xtest = X_test.copy() - ytest = y_test.copy() - X_test_ = X_test.copy() - y_test_ = y_test.copy() - - Xtest.reset_index(drop=True, inplace=True) - ytest.reset_index(drop=True, inplace=True) - X_test_.reset_index(drop=True, inplace=True) - y_test_.reset_index(drop=True, inplace=True) - - model = estimator - estimator_ = estimator - - else: - - Xtest = prep_pipe_transformer.transform(data) - X_test_ = data.copy() #original concater - - Xtest.reset_index(drop=True, inplace=True) - X_test_.reset_index(drop=True, inplace=True) - - estimator_ = estimator - - if type(estimator) is list: - - if type(estimator[0]) is list: - - """ - Multiple Layer Stacking - """ - - #utility - stacker = model - restack = stacker.pop() - stacker_method = stacker.pop() - #stacker_method = stacker_method[0] - stacker_meta = stacker.pop() - stacker_base = stacker.pop(0) - - #base model names - base_model_names = [] - - #defining base_level_names - for i in stacker_base: - b = str(i).split("(")[0] - base_model_names.append(b) - - base_level_fixed = [] - - for i in base_model_names: - if 'CatBoostClassifier' in i: - a = 'CatBoostClassifier' - base_level_fixed.append(a) - else: - base_level_fixed.append(i) - - base_level_fixed_2 = [] - - counter = 0 - for i in base_level_fixed: - s = str(i) + '_' + 'BaseLevel_' + str(counter) - base_level_fixed_2.append(s) - counter += 1 - - base_level_fixed = base_level_fixed_2 - - """ - base level predictions - """ - base_pred = [] - for i in stacker_base: - if 'soft' in stacker_method: - try: - a = i.predict_proba(Xtest) #change - a = a[:,1] - except: - a = i.predict(Xtest) #change - else: - a = i.predict(Xtest) #change - base_pred.append(a) - - base_pred_df = pd.DataFrame() - for i in base_pred: - a = pd.DataFrame(i) - base_pred_df = pd.concat([base_pred_df, a], axis=1) - - base_pred_df.columns = base_level_fixed - - base_pred_df_no_restack = base_pred_df.copy() - base_pred_df = pd.concat([Xtest,base_pred_df], axis=1) - - - """ - inter level predictions - """ - - inter_pred = [] - combined_df = pd.DataFrame(base_pred_df) - - inter_counter = 0 - - for level in stacker: - - inter_pred_df = pd.DataFrame() - - model_counter = 0 - - for model in level: - - try: - if inter_counter == 0: - if 'soft' in stacker_method: #changed - try: - p = model.predict_proba(base_pred_df) - p = p[:,1] - except: - try: - p = model.predict_proba(base_pred_df_no_restack) - p = p[:,1] - except: - try: - p = model.predict(base_pred_df) - except: - p = model.predict(base_pred_df_no_restack) - 
else: - try: - p = model.predict(base_pred_df) - except: - p = model.predict(base_pred_df_no_restack) - else: - if 'soft' in stacker_method: - try: - p = model.predict_proba(last_level_df) - p = p[:,1] - except: - p = model.predict(last_level_df) - else: - p = model.predict(last_level_df) - except: - if 'soft' in stacker_method: - try: - p = model.predict_proba(combined_df) - p = p[:,1] - except: - p = model.predict(combined_df) - - p = pd.DataFrame(p) - - col = str(model).split("(")[0] - if 'CatBoostClassifier' in col: - col = 'CatBoostClassifier' - col = col + '_InterLevel_' + str(inter_counter) + '_' + str(model_counter) - p.columns = [col] - - inter_pred_df = pd.concat([inter_pred_df, p], axis=1) - - model_counter += 1 - - last_level_df = inter_pred_df.copy() - - inter_counter += 1 - - combined_df = pd.concat([combined_df,inter_pred_df], axis=1) - - """ - meta final predictions - """ - - #final meta predictions - - try: - pred_ = stacker_meta.predict(combined_df) - except: - pred_ = stacker_meta.predict(inter_pred_df) - - try: - pred_prob = stacker_meta.predict_proba(combined_df) - - if len(pred_prob[0]) > 2: - p_counter = 0 - d = [] - for i in range(0,len(pred_prob)): - d.append(pred_prob[i][pred_[p_counter]]) - p_counter += 1 - - pred_prob = d - - else: - pred_prob = pred_prob[:,1] - - except: - try: - pred_prob = stacker_meta.predict_proba(inter_pred_df) - - if len(pred_prob[0]) > 2: - p_counter = 0 - d = [] - for i in range(0,len(pred_prob)): - d.append(pred_prob[i][pred_[p_counter]]) - p_counter += 1 - - pred_prob = d - - else: - pred_prob = pred_prob[:,1] - - except: - pass - - #print('Success') - - if probability_threshold is not None: - try: - pred_ = (pred_prob >= probability_threshold).astype(int) - except: - pass - - if data is None: - sca = metrics.accuracy_score(ytest,pred_) - - try: - sc = metrics.roc_auc_score(ytest,pred_prob,average='weighted') - except: - sc = 0 - - if y.value_counts().count() > 2: - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - - else: - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - - df_score = pd.DataFrame( {'Model' : 'Stacking Classifier', 'Accuracy' : [sca], 'AUC' : [sc], 'Recall' : [recall], 'Prec.' 
: [precision], - 'F1' : [f1], 'Kappa' : [kappa], 'MCC':[mcc]}) - df_score = df_score.round(4) - if verbose: - display(df_score) - - label = pd.DataFrame(pred_) - label.columns = ['Label'] - label['Label']=label['Label'].astype(int) - - if data is None: - X_test_ = pd.concat([Xtest,ytest,label], axis=1) - else: - X_test_ = pd.concat([X_test_,label], axis=1) #change here - - if hasattr(stacker_meta,'predict_proba'): - try: - score = pd.DataFrame(pred_prob) - score.columns = ['Score'] - score = score.round(4) - X_test_ = pd.concat([X_test_,score], axis=1) - except: - pass - - else: - - """ - Single Layer Stacking - """ - - #copy - stacker = model - - #restack - restack = stacker.pop() - - #method - method = stacker.pop() - - #separate metamodel - meta_model = stacker.pop() - - model_names = [] - for i in stacker: - model_names = np.append(model_names, str(i).split("(")[0]) - - model_names_fixed = [] - - for i in model_names: - if 'CatBoostClassifier' in i: - a = 'CatBoostClassifier' - model_names_fixed.append(a) - else: - model_names_fixed.append(i) - - model_names = model_names_fixed - - model_names_fixed = [] - counter = 0 - - for i in model_names: - s = str(i) + '_' + str(counter) - model_names_fixed.append(s) - counter += 1 - - model_names = model_names_fixed - - base_pred = [] - - for i in stacker: - if method == 'hard': - #print('done') - p = i.predict(Xtest) #change - - else: - - try: - p = i.predict_proba(Xtest) #change - p = p[:,1] - except: - p = i.predict(Xtest) #change - - base_pred.append(p) - - df = pd.DataFrame() - for i in base_pred: - i = pd.DataFrame(i) - df = pd.concat([df,i], axis=1) - - df.columns = model_names - - df_restack = pd.concat([Xtest,df], axis=1) #change - - #ytest = ytest #change - - #meta predictions starts here - - df.fillna(value=0,inplace=True) - df_restack.fillna(value=0,inplace=True) - - #restacking check - try: - pred_ = meta_model.predict(df) - except: - pred_ = meta_model.predict(df_restack) - - try: - pred_prob = meta_model.predict_proba(df) - - if len(pred_prob[0]) > 2: - p_counter = 0 - d = [] - for i in range(0,len(pred_prob)): - d.append(pred_prob[i][pred_[p_counter]]) - p_counter += 1 - - pred_prob = d - - else: - pred_prob = pred_prob[:,1] - - except: - - try: - pred_prob = meta_model.predict_proba(df_restack) - - if len(pred_prob[0]) > 2: - p_counter = 0 - d = [] - for i in range(0,len(pred_prob)): - d.append(pred_prob[i][pred_[p_counter]]) - p_counter += 1 - - pred_prob = d - - else: - pred_prob = pred_prob[:,1] - except: - pass - - if probability_threshold is not None: - try: - pred_ = (pred_prob >= probability_threshold).astype(int) - except: - pass - - if data is None: - - sca = metrics.accuracy_score(ytest,pred_) - - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - - if y.value_counts().count() > 2: - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - else: - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - - df_score = pd.DataFrame( {'Model' : 'Stacking Classifier', 'Accuracy' : [sca], 'AUC' : [sc], 'Recall' : [recall], 'Prec.' 
: [precision], - 'F1' : [f1], 'Kappa' : [kappa], 'MCC':[mcc]}) - df_score = df_score.round(4) - if verbose: - display(df_score) - - label = pd.DataFrame(pred_) - label.columns = ['Label'] - label['Label']=label['Label'].astype(int) - - if data is None: - X_test_ = pd.concat([Xtest,ytest,label], axis=1) #changed - else: - X_test_ = pd.concat([X_test_,label], axis=1) #change here - - if hasattr(meta_model,'predict_proba'): - try: - score = pd.DataFrame(pred_prob) - score.columns = ['Score'] - score = score.round(4) - X_test_ = pd.concat([X_test_,score], axis=1) - except: - pass - - else: - - #model name - full_name = str(model).split("(")[0] - def putSpace(input): - words = re.findall('[A-Z][a-z]*', input) - words = ' '.join(words) - return words - full_name = putSpace(full_name) - - if full_name == 'Gaussian N B': - full_name = 'Naive Bayes' - - elif full_name == 'M L P Classifier': - full_name = 'MLP Classifier' - - elif full_name == 'S G D Classifier': - full_name = 'SVM - Linear Kernel' - - elif full_name == 'S V C': - full_name = 'SVM - Radial Kernel' - - elif full_name == 'X G B Classifier': - full_name = 'Extreme Gradient Boosting' - - elif full_name == 'L G B M Classifier': - full_name = 'Light Gradient Boosting Machine' - - elif 'Cat Boost Classifier' in full_name: - full_name = 'CatBoost Classifier' - - - #prediction starts here - - pred_ = model.predict(Xtest) - - try: - pred_prob = model.predict_proba(Xtest) - - if len(pred_prob[0]) > 2: - p_counter = 0 - d = [] - for i in range(0,len(pred_prob)): - d.append(pred_prob[i][pred_[p_counter]]) - p_counter += 1 - - pred_prob = d - - else: - pred_prob = pred_prob[:,1] - except: - pass - - if probability_threshold is not None: - try: - pred_ = (pred_prob >= probability_threshold).astype(int) - except: - pass - - if data is None: - - sca = metrics.accuracy_score(ytest,pred_) - - try: - sc = metrics.roc_auc_score(ytest,pred_prob) - except: - sc = 0 - - if y.value_counts().count() > 2: - recall = metrics.recall_score(ytest,pred_, average='macro') - precision = metrics.precision_score(ytest,pred_, average = 'weighted') - f1 = metrics.f1_score(ytest,pred_, average='weighted') - else: - recall = metrics.recall_score(ytest,pred_) - precision = metrics.precision_score(ytest,pred_) - f1 = metrics.f1_score(ytest,pred_) - - kappa = metrics.cohen_kappa_score(ytest,pred_) - mcc = metrics.matthews_corrcoef(ytest,pred_) - - df_score = pd.DataFrame( {'Model' : [full_name], 'Accuracy' : [sca], 'AUC' : [sc], 'Recall' : [recall], 'Prec.' : [precision], - 'F1' : [f1], 'Kappa' : [kappa], 'MCC':[mcc]}) - df_score = df_score.round(4) - - if verbose: - display(df_score) - - label = pd.DataFrame(pred_) - label.columns = ['Label'] - label['Label']=label['Label'].astype(int) - - if data is None: - X_test_ = pd.concat([Xtest,ytest,label], axis=1) - else: - X_test_ = pd.concat([X_test_,label], axis=1) - - if hasattr(model,'predict_proba'): - try: - score = pd.DataFrame(pred_prob) - score.columns = ['Score'] - score = score.round(4) - X_test_ = pd.concat([X_test_,score], axis=1) - except: - pass - - #store predictions on hold-out in display_container - try: - display_container.append(df_score) - except: - pass - - return X_test_ - -def deploy_model(model, - model_name, - authentication, - platform = 'aws'): - - """ - - Description: - ------------ - (In Preview) - - This function deploys the transformation pipeline and trained model object for - production use. 
The platform of deployment can be defined under the platform - param along with the applicable authentication tokens which are passed as a - dictionary to the authentication param. - - Example: - -------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - lr = create_model('lr') - - deploy_model(model = lr, model_name = 'deploy_lr', platform = 'aws', - authentication = {'bucket' : 'pycaret-test'}) - - This will deploy the model on an AWS S3 account under bucket 'pycaret-test' - - For AWS users: - -------------- - Before deploying a model to an AWS S3 ('aws'), environment variables must be - configured using the command line interface. To configure AWS env. variables, - type aws configure in your python command line. The following information is - required which can be generated using the Identity and Access Management (IAM) - portal of your amazon console account: - - - AWS Access Key ID - - AWS Secret Key Access - - Default Region Name (can be seen under Global settings on your AWS console) - - Default output format (must be left blank) - - Parameters - ---------- - model : object - A trained model object should be passed as an estimator. - - model_name : string - Name of model to be passed as a string. - - authentication : dict - dictionary of applicable authentication tokens. - - When platform = 'aws': - {'bucket' : 'Name of Bucket on S3'} - - platform: string, default = 'aws' - Name of platform for deployment. Current available options are: 'aws'. - - Returns: - -------- - Success Message - - Warnings: - --------- - - This function uses file storage services to deploy the model on cloud platform. - As such, this is efficient for batch-use. Where the production objective is to - obtain prediction at an instance level, this may not be the efficient choice as - it transmits the binary pickle file between your local python environment and - the platform. - - """ - - import sys - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing deploy_model()") - logger.info("""deploy_model(model={}, model_name={}, authentication={}, platform={})""".\ - format(str(model), str(model_name), str(authentication), str(platform))) - - #checking if awscli available - try: - import awscli - except: - logger.error("awscli library not found. pip install awscli to use deploy_model function.") - sys.exit("awscli library not found. 
pip install awscli to use deploy_model function.") - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - #general dependencies - import ipywidgets as ipw - import pandas as pd - from IPython.display import clear_output, update_display - import os - - if platform == 'aws': - - logger.info("Platform : AWS S3") - - import boto3 - - logger.info("Saving model in active working directory") - logger.info("SubProcess save_model() called ==================================") - save_model(model, model_name = model_name, verbose=False) - logger.info("SubProcess save_model() end ==================================") - - #initiaze s3 - logger.info("Initializing S3 client") - s3 = boto3.client('s3') - filename = str(model_name)+'.pkl' - key = str(model_name)+'.pkl' - bucket_name = authentication.get('bucket') - s3.upload_file(filename,bucket_name,key) - clear_output() - os.remove(filename) - print("Model Succesfully Deployed on AWS S3") - logger.info(str(model)) - logger.info("deploy_model() succesfully completed......................................") - -def optimize_threshold(estimator, - true_positive = 0, - true_negative = 0, - false_positive = 0, - false_negative = 0): - - """ - - Description: - ------------ - This function optimizes probability threshold for a trained model using custom cost - function that can be defined using combination of True Positives, True Negatives, - False Positives (also known as Type I error), and False Negatives (Type II error). - - This function returns a plot of optimized cost as a function of probability - threshold between 0 to 100. - - Example - ------- - from pycaret.datasets import get_data - juice = get_data('juice') - experiment_name = setup(data = juice, target = 'Purchase') - - lr = create_model('lr') - - optimize_threshold(lr, true_negative = 10, false_negative = -100) - - This will return a plot of optimized cost as a function of probability threshold. - - Parameters - ---------- - estimator : object - A trained model object should be passed as an estimator. - - true_positive : int, default = 0 - Cost function or returns when prediction is true positive. - - true_negative : int, default = 0 - Cost function or returns when prediction is true negative. - - false_positive : int, default = 0 - Cost function or returns when prediction is false positive. - - false_negative : int, default = 0 - Cost function or returns when prediction is false negative. - - - Returns: - -------- - - Visual Plot: Prints the visual plot. - ------------ - - Warnings: - --------- - - This function is not supported for multiclass problems. 
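-    
-    - The cost reported at each probability threshold is computed as:
-    
-      total_cost = TP*true_positive + TN*true_negative + FP*false_positive + FN*false_negative
-    
-      For example, with true_negative = 10 and false_negative = -100, a threshold
-      producing 90 true negatives and 5 false negatives scores 90*10 + 5*(-100) = 400.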
- - - """ - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing optimize_threshold()") - logger.info("""optimize_threshold(estimator={}, true_positive={}, true_negative={}, false_positive={}, false_negative={})""".\ - format(str(estimator), str(true_positive), str(true_negative), str(false_positive), str(false_negative))) - - logger.info("Importing libraries") - - #import libraries - import sys - import pandas as pd - import numpy as np - import plotly.express as px - from IPython.display import clear_output - - #cufflinks - import cufflinks as cf - cf.go_offline() - cf.set_config_file(offline=False, world_readable=True) - - - ''' - ERROR HANDLING STARTS HERE - ''' - - logger.info("Checking exceptions") - - #exception 1 for multi-class - if y.value_counts().count() > 2: - sys.exit("(Type Error) optimize_threshold() cannot be used when target is multi-class. ") - - model_name = str(estimator).split("(")[0] - if 'OneVsRestClassifier' in model_name: - sys.exit("(Type Error) optimize_threshold() cannot be used when target is multi-class. ") - - #check predict_proba value - if type(estimator) is not list: - if not hasattr(estimator, 'predict_proba'): - sys.exit("(Type Error) Estimator doesn't support predict_proba function and cannot be used in optimize_threshold(). ") - - #check cost function type - allowed_types = [int, float] - - if type(true_positive) not in allowed_types: - sys.exit("(Type Error) true_positive parameter only accepts float or integer value. ") - - if type(true_negative) not in allowed_types: - sys.exit("(Type Error) true_negative parameter only accepts float or integer value. ") - - if type(false_positive) not in allowed_types: - sys.exit("(Type Error) false_positive parameter only accepts float or integer value. ") - - if type(false_negative) not in allowed_types: - sys.exit("(Type Error) false_negative parameter only accepts float or integer value. ") - - - - ''' - ERROR HANDLING ENDS HERE - ''' - - - #define model as estimator - model = estimator - - model_name = str(model).split("(")[0] - if 'CatBoostClassifier' in model_name: - model_name = 'CatBoostClassifier' - - #generate predictions and store actual on y_test in numpy array - actual = np.array(y_test) - - if type(model) is list: - logger.info("Model Type : Stacking") - predicted = predict_model(model) - model_name = 'Stacking' - clear_output() - try: - predicted = np.array(predicted['Score']) - except: - logger.info("Meta model doesn't support predict_proba function.") - sys.exit("(Type Error) Meta model doesn't support predict_proba function. Cannot be used in optimize_threshold(). 
") - - else: - predicted = model.predict_proba(X_test) - predicted = predicted[:,1] - - """ - internal function to calculate loss starts here - """ - - logger.info("Defining loss function") - - def calculate_loss(actual,predicted, - tp_cost=true_positive,tn_cost=true_negative, - fp_cost=false_positive,fn_cost=false_negative): - - #true positives - tp = predicted + actual - tp = np.where(tp==2, 1, 0) - tp = tp.sum() - - #true negative - tn = predicted + actual - tn = np.where(tn==0, 1, 0) - tn = tn.sum() - - #false positive - fp = (predicted > actual).astype(int) - fp = np.where(fp==1, 1, 0) - fp = fp.sum() - - #false negative - fn = (predicted < actual).astype(int) - fn = np.where(fn==1, 1, 0) - fn = fn.sum() - - total_cost = (tp_cost*tp) + (tn_cost*tn) + (fp_cost*fp) + (fn_cost*fn) - - return total_cost - - - """ - internal function to calculate loss ends here - """ - - grid = np.arange(0,1,0.01) - - #loop starts here - - cost = [] - #global optimize_results - - logger.info("Iteration starts at 0") - - for i in grid: - - pred_prob = (predicted >= i).astype(int) - cost.append(calculate_loss(actual,pred_prob)) - - optimize_results = pd.DataFrame({'Probability Threshold' : grid, 'Cost Function' : cost }) - fig = px.line(optimize_results, x='Probability Threshold', y='Cost Function', line_shape='linear') - fig.update_layout(plot_bgcolor='rgb(245,245,245)') - title= str(model_name) + ' Probability Threshold Optimization' - - #calculate vertical line - y0 = optimize_results['Cost Function'].min() - y1 = optimize_results['Cost Function'].max() - x0 = optimize_results.sort_values(by='Cost Function', ascending=False).iloc[0][0] - x1 = x0 - - t = x0.round(2) - - fig.add_shape(dict(type="line", x0=x0, y0=y0, x1=x1, y1=y1,line=dict(color="red",width=2))) - fig.update_layout(title={'text': title, 'y':0.95,'x':0.45,'xanchor': 'center','yanchor': 'top'}) - logger.info("Figure ready for render") - fig.show() - print('Optimized Probability Threshold: ' + str(t) + ' | ' + 'Optimized Cost Function: ' + str(y1)) - logger.info("optimize_threshold() succesfully completed......................................") - -def automl(optimize='Accuracy', use_holdout=False): - - """ - Description: - ------------ - This function returns the best model out of all models created in - current active environment based on metric defined in optimize parameter. - - Parameters - ---------- - optimize : string, default = 'Accuracy' - Other values you can pass in optimize param are 'AUC', 'Recall', 'Precision', - 'F1', 'Kappa', and 'MCC'. - - use_holdout: bool, default = False - When set to True, metrics are evaluated on holdout set instead of CV. - - - """ - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing automl()") - logger.info("""automl(optimize={}, use_holdout={})""".\ - format(str(optimize), str(use_holdout))) - - if optimize == 'Accuracy': - compare_dimension = 'Accuracy' - elif optimize == 'AUC': - compare_dimension = 'AUC' - elif optimize == 'Recall': - compare_dimension = 'Recall' - elif optimize == 'Precision': - compare_dimension = 'Prec.' 
-def pull():
-    return display_container[-1]
-
-def models(type=None):
-
-    """
-
-    Description:
-    ------------
-    Returns a table of models available in the model library.
-
-    Example
-    -------
-    all_models = models()
-
-    This will return a pandas dataframe with all available 
-    models and their metadata.
-
-    Parameters
-    ----------
-    type : string, default = None
-    - linear : filters and only returns linear models
-    - tree : filters and only returns tree based models
-    - ensemble : filters and only returns ensemble models
-    
-    """
-    
-    import pandas as pd
-
-    model_id = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost']
-    
-    model_name = ['Logistic Regression',
-                  'K Neighbors Classifier',
-                  'Naive Bayes',
-                  'Decision Tree Classifier',
-                  'SVM - Linear Kernel',
-                  'SVM - Radial Kernel',
-                  'Gaussian Process Classifier',
-                  'MLP Classifier',
-                  'Ridge Classifier',
-                  'Random Forest Classifier',
-                  'Quadratic Discriminant Analysis',
-                  'Ada Boost Classifier',
-                  'Gradient Boosting Classifier',
-                  'Linear Discriminant Analysis',
-                  'Extra Trees Classifier',
-                  'Extreme Gradient Boosting',
-                  'Light Gradient Boosting Machine',
-                  'CatBoost Classifier']
-    
-    model_ref = ['sklearn.linear_model.LogisticRegression',
-                 'sklearn.neighbors.KNeighborsClassifier',
-                 'sklearn.naive_bayes.GaussianNB',
-                 'sklearn.tree.DecisionTreeClassifier',
-                 'sklearn.linear_model.SGDClassifier',
-                 'sklearn.svm.SVC',
-                 'sklearn.gaussian_process.GaussianProcessClassifier',
-                 'sklearn.neural_network.MLPClassifier',
-                 'sklearn.linear_model.RidgeClassifier',
-                 'sklearn.ensemble.RandomForestClassifier',
-                 'sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis',
-                 'sklearn.ensemble.AdaBoostClassifier',
-                 'sklearn.ensemble.GradientBoostingClassifier',
-                 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis',
-                 'sklearn.ensemble.ExtraTreesClassifier',
-                 'xgboost.readthedocs.io',
-                 'github.com/microsoft/LightGBM',
-                 'catboost.ai']
-
-    model_turbo = [True, True, True, True, True, False, False, False, True,
-                   True, True, True, True, True, True, True, True, True]
-
-    df = pd.DataFrame({'ID' : model_id, 
-                       'Name' : model_name,
-                       'Reference' : model_ref,
-                       'Turbo' : model_turbo})
-
-    df.set_index('ID', inplace=True)
-
-    linear_models = ['lr', 'ridge', 'svm']
-    tree_models = ['dt'] 
-    ensemble_models = ['rf', 'et', 'gbc', 'xgboost', 'lightgbm', 'catboost', 'ada']
-
-    if type == 'linear':
-        df = df[df.index.isin(linear_models)]
-    if type == 'tree':
-        df = df[df.index.isin(tree_models)]
-    if type == 'ensemble':
-        df = df[df.index.isin(ensemble_models)]
-
-    return df
-
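# --- Illustrative usage sketch (not part of the original module) ---
# The Turbo column flags estimators cheap enough for quick comparisons; the
# three rows set to False above (rbfsvm, gpc, mlp) are the expensive ones.
# Filtering examples:

from pycaret.classification import models

ensemble_table = models(type='ensemble')              # rf, et, gbc, ada, xgboost, lightgbm, catboost only
fast_ids = models()[models()['Turbo']].index.tolist() # IDs of models flagged as Turbo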
-def get_logs(experiment_name = None, save = False):
-
-    """
-
-    Description:
-    ------------
-    Returns a table with experiment logs consisting of run details, 
-    parameters, metrics and tags. 
-
-    Example
-    -------
-    logs = get_logs()
-
-    This will return a pandas dataframe.
-
-    Parameters
-    ----------
-    experiment_name : string, default = None
-    When set to None, the current active run is used.
-
-    save : bool, default = False
-    When set to True, a csv file is saved in the current directory.
-
-    """
-
-    import sys
-
-    if experiment_name is None:
-        exp_name_log_ = exp_name_log
-    else:
-        exp_name_log_ = experiment_name
-
-    import mlflow
-    from mlflow.tracking import MlflowClient
-
-    client = MlflowClient()
-
-    if client.get_experiment_by_name(exp_name_log_) is None:
-        sys.exit('No active run found. Check the logging parameter in setup, or pass experiment_name to get logs for an inactive run.')
-
-    exp_id = client.get_experiment_by_name(exp_name_log_).experiment_id
-    runs = mlflow.search_runs(exp_id)
-
-    if save:
-        file_name = str(exp_name_log_) + '_logs.csv'
-        runs.to_csv(file_name, index=False)
-
-    return runs
-
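# --- Illustrative sketch (not part of the original module) ---
# get_logs() is a thin wrapper over mlflow: it resolves the experiment name to
# an ID with MlflowClient and fetches the runs as a DataFrame via
# mlflow.search_runs(). Roughly equivalent standalone code, with a
# hypothetical experiment name:

import mlflow
from mlflow.tracking import MlflowClient

exp = MlflowClient().get_experiment_by_name('juice-experiment')  # hypothetical name
if exp is not None:
    runs = mlflow.search_runs([exp.experiment_id])               # one row per logged run
    runs.to_csv('juice-experiment_logs.csv', index=False)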
- - - """ - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing get_config()") - logger.info("""get_config(variable={})""".\ - format(str(variable))) - - if variable == 'X': - global_var = X - - if variable == 'y': - global_var = y - - if variable == 'X_train': - global_var = X_train - - if variable == 'X_test': - global_var = X_test - - if variable == 'y_train': - global_var = y_train - - if variable == 'y_test': - global_var = y_test - - if variable == 'seed': - global_var = seed - - if variable == 'prep_pipe': - global_var = prep_pipe - - if variable == 'folds_shuffle_param': - global_var = folds_shuffle_param - - if variable == 'n_jobs_param': - global_var = n_jobs_param - - if variable == 'html_param': - global_var = html_param - - if variable == 'create_model_container': - global_var = create_model_container - - if variable == 'master_model_container': - global_var = master_model_container - - if variable == 'display_container': - global_var = display_container - - if variable == 'exp_name_log': - global_var = exp_name_log - - if variable == 'logging_param': - global_var = logging_param - - if variable == 'log_plots_param': - global_var = log_plots_param - - if variable == 'USI': - global_var = USI - - if variable == 'fix_imbalance_param': - global_var = fix_imbalance_param - - if variable == 'fix_imbalance_method_param': - global_var = fix_imbalance_method_param - - logger.info("Global variable: " + str(variable) + ' returned') - logger.info("get_config() succesfully completed......................................") - - return global_var - -def set_config(variable,value): - - """ - Description: - ------------ - This function is used to reset global environment variables. - Following variables can be accessed: - - - X: Transformed dataset (X) - - y: Transformed dataset (y) - - X_train: Transformed train dataset (X) - - X_test: Transformed test/holdout dataset (X) - - y_train: Transformed train dataset (y) - - y_test: Transformed test/holdout dataset (y) - - seed: random state set through session_id - - prep_pipe: Transformation pipeline configured through setup - - folds_shuffle_param: shuffle parameter used in Kfolds - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - create_model_container: results grid storage container - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment set through setup - - logging_param: log_experiment param set through setup - - log_plots_param: log_plots param set through setup - - USI: Unique session ID parameter set through setup - - fix_imbalance_param: fix_imbalance param set through setup - - fix_imbalance_method_param: fix_imbalance_method param set through setup - - Example: - -------- - set_config('seed', 123) - - This will set the global seed to '123'. 
- - - """ - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing set_config()") - logger.info("""set_config(variable={}, value={})""".\ - format(str(variable), str(value))) - - if variable == 'X': - global X - X = value - - if variable == 'y': - global y - y = value - - if variable == 'X_train': - global X_train - X_train = value - - if variable == 'X_test': - global X_test - X_test = value - - if variable == 'y_train': - global y_train - y_train = value - - if variable == 'y_test': - global y_test - y_test = value - - if variable == 'seed': - global seed - seed = value - - if variable == 'prep_pipe': - global prep_pipe - prep_pipe = value - - if variable == 'folds_shuffle_param': - global folds_shuffle_param - folds_shuffle_param = value - - if variable == 'n_jobs_param': - global n_jobs_param - n_jobs_param = value - - if variable == 'html_param': - global html_param - html_param = value - - if variable == 'create_model_container': - global create_model_container - create_model_container = value - - if variable == 'master_model_container': - global master_model_container - master_model_container = value - - if variable == 'display_container': - global display_container - display_container = value - - if variable == 'exp_name_log': - global exp_name_log - exp_name_log = value - - if variable == 'logging_param': - global logging_param - logging_param = value - - if variable == 'log_plots_param': - global log_plots_param - log_plots_param = value - - if variable == 'USI': - global USI - USI = value - - if variable == 'fix_imbalance_param': - global fix_imbalance_param - fix_imbalance_param = value - - if variable == 'fix_imbalance_method_param': - global fix_imbalance_method_param - fix_imbalance_method_param = value - - logger.info("Global variable: " + str(variable) + ' updated') - logger.info("set_config() succesfully completed......................................") - -def get_system_logs(): - - """ - Read and print 'logs.log' file from current active directory - """ - - file = open('logs.log', 'r') - lines = file.read().splitlines() - file.close() - - for line in lines: - if not line: - continue - - columns = [col.strip() for col in line.split(':') if col] - print(columns) \ No newline at end of file