From 6ca2a5170a60963c9c942ae52fe18b43e18e5ccd Mon Sep 17 00:00:00 2001
From: PyCaret
Date: Fri, 31 Jul 2020 01:26:02 -0400
Subject: [PATCH] Delete anomaly.py

---
 build/lib/pycaret/anomaly.py | 4209 ----------------------------------
 1 file changed, 4209 deletions(-)
 delete mode 100644 build/lib/pycaret/anomaly.py

diff --git a/build/lib/pycaret/anomaly.py b/build/lib/pycaret/anomaly.py
deleted file mode 100644
index 72b315c..0000000
--- a/build/lib/pycaret/anomaly.py
+++ /dev/null
@@ -1,4209 +0,0 @@
-# Module: Anomaly Detection
-# Author: Moez Ali
-# License: MIT
-# Release: PyCaret 2.0x
-# Last modified : 30/07/2020
-
-def setup(data,
-          categorical_features = None,
-          categorical_imputation = 'constant',
-          ordinal_features = None,
-          high_cardinality_features = None,
-          numeric_features = None,
-          numeric_imputation = 'mean',
-          date_features = None,
-          ignore_features = None,
-          normalize = False,
-          normalize_method = 'zscore',
-          transformation = False,
-          transformation_method = 'yeo-johnson',
-          handle_unknown_categorical = True,
-          unknown_categorical_method = 'least_frequent',
-          pca = False,
-          pca_method = 'linear',
-          pca_components = None,
-          ignore_low_variance = False,
-          combine_rare_levels = False,
-          rare_level_threshold = 0.10,
-          bin_numeric_features = None,
-          remove_multicollinearity = False,
-          multicollinearity_threshold = 0.9,
-          group_features = None,
-          group_names = None,
-          supervised = False,
-          supervised_target = None,
-          n_jobs = -1, #added in pycaret==2.0.0
-          html = True, #added in pycaret==2.0.0
-          session_id = None,
-          log_experiment = False, #added in pycaret==2.0.0
-          experiment_name = None, #added in pycaret==2.0.0
-          log_plots = False, #added in pycaret==2.0.0
-          log_profile = False, #added in pycaret==2.0.0
-          log_data = False, #added in pycaret==2.0.0
-          silent=False, #added in pycaret==2.0.0
-          verbose=True,
-          profile = False):
-
-    """
-
-    Description:
-    ------------
-    This function initializes the environment in pycaret. setup() must be called
-    before executing any other function in pycaret. It takes one mandatory
-    parameter: dataframe {array-like, sparse matrix}.
-
-    Example
-    -------
-    from pycaret.datasets import get_data
-    anomaly = get_data('anomaly')
-
-    experiment_name = setup(data = anomaly, normalize = True)
-
-    'anomaly' is a pandas DataFrame.
-
-    Parameters
-    ----------
-    data : {array-like, sparse matrix}, shape (n_samples, n_features) where n_samples
-    is the number of samples and n_features is the number of features in the dataframe.
-
-    categorical_features: string, default = None
-    If the inferred data types are not correct, categorical_features can be used to
-    overwrite the inferred type. If, when running setup, the type of 'column1' is
-    inferred as numeric instead of categorical, this parameter can be used
-    to overwrite the inferred type by passing categorical_features = ['column1'].
-
-    categorical_imputation: string, default = 'constant'
-    If missing values are found in categorical features, they will be imputed with
-    a constant 'not_available' value. The other available option is 'mode' which
-    imputes the missing value using the most frequent value in the training dataset.
-
-    ordinal_features: dictionary, default = None
-    When the data contains ordinal features, they must be encoded differently using
-    the ordinal_features param. If the data has a categorical variable with values
-    of 'low', 'medium', 'high' and it is known that low < medium < high, then it can
-    be passed as ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
-    The list sequence must be in increasing order from lowest to highest.
-
-    high_cardinality_features: string, default = None
-    When the data contains features with high cardinality, they can be compressed
-    into fewer levels by passing them as a list of column names with high cardinality.
-    Features are compressed using frequency distribution. As such, original features
-    are replaced with the frequency distribution and converted into a numeric variable.
-
-    numeric_features: string, default = None
-    If the inferred data types are not correct, numeric_features can be used to
-    overwrite the inferred type. If, when running setup, the type of 'column1' is
-    inferred as categorical instead of numeric, this parameter can be used
-    to overwrite the inferred type by passing numeric_features = ['column1'].
-
-    numeric_imputation: string, default = 'mean'
-    If missing values are found in numeric features, they will be imputed with the
-    mean value of the feature. The other available option is 'median' which imputes
-    the value using the median value in the training dataset.
-
-    date_features: string, default = None
-    If the data has a DateTime column that is not automatically detected when running
-    setup, this parameter can be used by passing date_features = 'date_column_name'.
-    It can work with multiple date columns. Date columns are not used in modeling.
-    Instead, feature extraction is performed and date columns are dropped from the
-    dataset. If the date column includes a time stamp, features related to time will
-    also be extracted.
-
-    ignore_features: string, default = None
-    If any feature should be ignored for modeling, it can be passed to the param
-    ignore_features. The ID and DateTime columns, when inferred, are automatically
-    set to ignore for modeling.
-
-    normalize: bool, default = False
-    When set to True, the feature space is transformed using the normalize_method
-    param. Generally, linear algorithms perform better with normalized data; however,
-    the results may vary and it is advised to run multiple experiments to evaluate
-    the benefit of normalization.
-
-    normalize_method: string, default = 'zscore'
-    Defines the method to be used for normalization. By default, the normalize method
-    is set to 'zscore'. The standard zscore is calculated as z = (x - u) / s. The
-    other available options are:
-
-    'minmax'  : scales and translates each feature individually such that it is in
-                the range of 0 - 1.
-
-    'maxabs'  : scales and translates each feature individually such that the maximal
-                absolute value of each feature will be 1.0. It does not shift/center
-                the data, and thus does not destroy any sparsity.
-
-    'robust'  : scales and translates each feature according to the interquartile range.
-                When the dataset contains outliers, the robust scaler often gives better
-                results.
-
-    transformation: bool, default = False
-    When set to True, a power transformation is applied to make the data more normal /
-    Gaussian-like. This is useful for modeling issues related to heteroscedasticity or
-    other situations where normality is desired. The optimal parameter for stabilizing
-    variance and minimizing skewness is estimated through maximum likelihood.
-
-    transformation_method: string, default = 'yeo-johnson'
-    Defines the method for transformation. By default, the transformation method is set
-    to 'yeo-johnson'. The other available option is 'quantile' transformation. Both
-    transformations map the feature set to follow a Gaussian-like or normal
-    distribution.
-    Note that the quantile transformer is non-linear and may distort linear
-    correlations between variables measured at the same scale.
-
-    handle_unknown_categorical: bool, default = True
-    When set to True, unknown categorical levels in new / unseen data are replaced by
-    the most or least frequent level as learned in the training data. The method is
-    defined under the unknown_categorical_method param.
-
-    unknown_categorical_method: string, default = 'least_frequent'
-    Method used to replace unknown categorical levels in unseen data. Method can be
-    set to 'least_frequent' or 'most_frequent'.
-
-    pca: bool, default = False
-    When set to True, dimensionality reduction is applied to project the data into
-    a lower dimensional space using the method defined in the pca_method param. In
-    supervised learning, pca is generally performed when dealing with a high feature
-    space and memory is a constraint. Note that not all datasets can be decomposed
-    efficiently using a linear PCA technique and that applying PCA may result in loss
-    of information. As such, it is advised to run multiple experiments with different
-    pca_methods to evaluate the impact.
-
-    pca_method: string, default = 'linear'
-    The 'linear' method performs Linear dimensionality reduction using Singular Value
-    Decomposition. The other available options are:
-
-    kernel      : dimensionality reduction through the use of the RBF kernel.
-
-    incremental : replacement for 'linear' pca when the dataset to be decomposed is
-                  too large to fit in memory.
-
-    pca_components: int/float, default = 0.99
-    Number of components to keep. If pca_components is a float, it is treated as a
-    target percentage for information retention. When pca_components is an integer
-    it is treated as the number of features to be kept. pca_components must be strictly
-    less than the original number of features in the dataset.
-
-    ignore_low_variance: bool, default = False
-    When set to True, all categorical features with statistically insignificant variances
-    are removed from the dataset. The variance is calculated using the ratio of unique
-    values to the number of samples, and the ratio of the most common value to the
-    frequency of the second most common value.
-
-    combine_rare_levels: bool, default = False
-    When set to True, all levels in categorical features below the threshold defined
-    in the rare_level_threshold param are combined together as a single level. There must
-    be at least two levels under the threshold for this to take effect. rare_level_threshold
-    represents the percentile distribution of level frequency. Generally, this technique
-    is applied to limit a sparse matrix caused by high numbers of levels in categorical
-    features.
-
-    rare_level_threshold: float, default = 0.1
-    Percentile distribution below which rare categories are combined. Only comes into
-    effect when combine_rare_levels is set to True.
-
-    bin_numeric_features: list, default = None
-    When a list of numeric features is passed they are transformed into categorical
-    features using KMeans, where values in each bin have the same nearest center of a
-    1D k-means cluster. The number of clusters is determined based on the 'sturges'
-    method. It is only optimal for Gaussian data and underestimates the number of bins
-    for large non-Gaussian datasets.
-
-    remove_multicollinearity: bool, default = False
-    When set to True, the variables with inter-correlations higher than the threshold
-    defined under the multicollinearity_threshold param are dropped.
-    When two features are highly correlated with each other, the feature with
-    the higher average correlation in the feature space is dropped.
-
-    multicollinearity_threshold: float, default = 0.9
-    Threshold used for dropping the correlated features. Only comes into effect when
-    remove_multicollinearity is set to True.
-
-    group_features: list or list of list, default = None
-    When a dataset contains features that have related characteristics, the group_features
-    param can be used for statistical feature extraction. For example, if a dataset has
-    numeric features that are related with each other (i.e. 'Col1', 'Col2', 'Col3'), a list
-    containing the column names can be passed under group_features to extract statistical
-    information such as the mean, median, mode and standard deviation.
-
-    group_names: list, default = None
-    When group_features is passed, a name of the group can be passed into the group_names
-    param as a list containing strings. The length of a group_names list must equal the
-    length of group_features. When the length doesn't match or the name is not passed, new
-    features are sequentially named such as group_1, group_2 etc.
-
-    supervised: bool, default = False
-    When set to True, the supervised_target column is ignored for transformation. This
-    param is only for internal use.
-
-    supervised_target: string, default = None
-    Name of the supervised_target column that will be ignored for transformation. Only
-    applicable when the tune_model() function is used. This param is only for internal use.
-
-    n_jobs: int, default = -1
-    The number of jobs to run in parallel (for functions that support parallel
-    processing). -1 means using all processors. To run all functions on a single
-    processor set n_jobs to None.
-
-    html: bool, default = True
-    If set to False, prevents the runtime display of the monitor. This must be set to
-    False when using an environment that doesn't support HTML.
-
-    session_id: int, default = None
-    If None, a random seed is generated and returned in the Information grid. The
-    unique number is then distributed as a seed in all functions used during the
-    experiment. This can be used for later reproducibility of the entire experiment.
-
-    log_experiment: bool, default = False
-    When set to True, all metrics and parameters are logged on the MLflow server.
-
-    experiment_name: str, default = None
-    Name of experiment for logging. When set to None, 'ano-default-name' is used as
-    the experiment name by default.
-
-    log_plots: bool, default = False
-    When set to True, specific plots are logged in MLflow as a png file. By default,
-    it is set to False.
-
-    log_profile: bool, default = False
-    When set to True, the data profile is also logged on MLflow as a html file. By
-    default, it is set to False.
-
-    log_data: bool, default = False
-    When set to True, the dataset passed in setup() is logged on MLflow as a csv file.
-    By default, it is set to False.
-
-    silent: bool, default = False
-    When set to True, confirmation of data types is not required. All preprocessing will
-    be performed assuming automatically inferred data types. Not recommended for direct use
-    except for established pipelines.
-
-    verbose: Boolean, default = True
-    Information grid is not printed when verbose is set to False.
-
-    profile: bool, default = False
-    If set to True, a data profile for Exploratory Data Analysis will be displayed
-    in an interactive HTML report.
-
-    Returns:
-    --------
-
-    info grid:    Information grid is printed.
-    -----------
-
-    environment:  This function returns various outputs that are stored as a tuple
-    -----------   in variables. They are used by other functions in pycaret.
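-
-    As a fuller usage sketch (the parameter values below are illustrative
-    choices only, not recommendations; every parameter shown is documented above):
-
-    from pycaret.datasets import get_data
-    anomaly = get_data('anomaly')
-
-    exp = setup(data = anomaly,
-                normalize = True,
-                normalize_method = 'robust',
-                pca = True,
-                pca_components = 0.95,
-                combine_rare_levels = True,
-                session_id = 123)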
-
-    """
-
-    #exception checking
-    import sys
-
-    from pycaret.utils import __version__
-    ver = __version__()
-
-    import logging
-
-    # create logger
-    global logger
-
-    logger = logging.getLogger('logs')
-    logger.setLevel(logging.DEBUG)
-
-    # create console handler and set level to debug
-    if logger.hasHandlers():
-        logger.handlers.clear()
-
-    ch = logging.FileHandler('logs.log')
-    ch.setLevel(logging.DEBUG)
-
-    # create formatter
-    formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-
-    # add formatter to ch
-    ch.setFormatter(formatter)
-
-    # add ch to logger
-    logger.addHandler(ch)
-
-    logger.info("PyCaret Anomaly Detection Module")
-    logger.info('version ' + str(ver))
-    logger.info("Initializing setup()")
-
-    #generate USI for mlflow tracking
-    import secrets
-    global USI
-    USI = secrets.token_hex(nbytes=2)
-    logger.info('USI: ' + str(USI))
-
-    logger.info("""setup(data={}, categorical_features={}, categorical_imputation={}, ordinal_features={}, high_cardinality_features={},
-    numeric_features={}, numeric_imputation={}, date_features={}, ignore_features={}, normalize={},
-    normalize_method={}, transformation={}, transformation_method={}, handle_unknown_categorical={}, unknown_categorical_method={}, pca={}, pca_method={},
-    pca_components={}, ignore_low_variance={}, combine_rare_levels={}, rare_level_threshold={}, bin_numeric_features={},
-    remove_multicollinearity={}, multicollinearity_threshold={}, group_features={},
-    group_names={}, supervised={}, supervised_target={}, n_jobs={}, html={}, session_id={}, log_experiment={},
-    experiment_name={}, log_plots={}, log_profile={}, log_data={}, silent={}, verbose={}, profile={})""".format(\
-        str(data.shape), str(categorical_features), str(categorical_imputation), str(ordinal_features),\
-        str(high_cardinality_features), str(numeric_features), str(numeric_imputation), str(date_features), str(ignore_features),\
-        str(normalize), str(normalize_method), str(transformation), str(transformation_method), str(handle_unknown_categorical), str(unknown_categorical_method), str(pca),\
-        str(pca_method), str(pca_components), str(ignore_low_variance), str(combine_rare_levels), str(rare_level_threshold), str(bin_numeric_features),\
-        str(remove_multicollinearity), str(multicollinearity_threshold), str(group_features),str(group_names),str(supervised), str(supervised_target), str(n_jobs), str(html),\
-        str(session_id),str(log_experiment), str(experiment_name), str(log_plots),str(log_profile), str(log_data), str(silent), str(verbose), str(profile)))
-
-    #logging environment and libraries
-    logger.info("Checking environment")
-
-    from platform import python_version, platform, python_build, machine
-
-    try:
-        logger.info("python_version: " + str(python_version()))
-    except:
-        logger.warning("cannot find platform.python_version")
-
-    try:
-        logger.info("python_build: " + str(python_build()))
-    except:
-        logger.warning("cannot find platform.python_build")
-
-    try:
-        logger.info("machine: " + str(machine()))
-    except:
-        logger.warning("cannot find platform.machine")
-
-    try:
-        logger.info("platform: " + str(platform()))
-    except:
-        logger.warning("cannot find platform.platform")
-
-    try:
-        import psutil
-        logger.info("Memory: " + str(psutil.virtual_memory()))
-        logger.info("Physical Core: " + str(psutil.cpu_count(logical=False)))
-        logger.info("Logical Core: " + str(psutil.cpu_count(logical=True)))
-    except:
-        logger.warning("cannot find psutil installation. memory not traceable. Install psutil using pip to enable memory logging.")
") - - logger.info("Checking libraries") - - try: - from pandas import __version__ - logger.info("pd==" + str(__version__)) - except: - logger.warning("pandas not found") - - try: - from numpy import __version__ - logger.info("numpy==" + str(__version__)) - except: - logger.warning("numpy not found") - - try: - from pyod import __version__ - logger.info("pyod==" + str(__version__)) - except: - logger.warning("pyod not found") - - try: - from mlflow.version import VERSION - import warnings - warnings.filterwarnings('ignore') - logger.info("mlflow==" + str(VERSION)) - except: - logger.warning("mlflow not found") - - logger.info("Checking Exceptions") - - #run_time - import datetime, time - runtime_start = time.time() - - """ - error handling starts here - """ - - #checking data type - if hasattr(data,'shape') is False: - sys.exit('(Type Error): data passed must be of type pandas.DataFrame') - - #checking session_id - if session_id is not None: - if type(session_id) is not int: - sys.exit('(Type Error): session_id parameter must be an integer.') - - #checking normalize parameter - if type(normalize) is not bool: - sys.exit('(Type Error): normalize parameter only accepts True or False.') - - #checking transformation parameter - if type(transformation) is not bool: - sys.exit('(Type Error): transformation parameter only accepts True or False.') - - #checking categorical imputation - allowed_categorical_imputation = ['constant', 'mode'] - if categorical_imputation not in allowed_categorical_imputation: - sys.exit("(Value Error): categorical_imputation param only accepts 'constant' or 'mode' ") - - #ordinal_features - if ordinal_features is not None: - if type(ordinal_features) is not dict: - sys.exit("(Type Error): ordinal_features must be of type dictionary with column name as key and ordered values as list. ") - - #ordinal features check - if ordinal_features is not None: - data_cols = data.columns - #data_cols = data_cols.drop(target) - ord_keys = ordinal_features.keys() - - for i in ord_keys: - if i not in data_cols: - sys.exit("(Value Error) Column name passed as a key in ordinal_features param doesnt exist. ") - - for k in ord_keys: - if data[k].nunique() != len(ordinal_features.get(k)): - sys.exit("(Value Error) Levels passed in ordinal_features param doesnt match with levels in data. ") - - for i in ord_keys: - value_in_keys = ordinal_features.get(i) - value_in_data = list(data[i].unique().astype(str)) - for j in value_in_keys: - if j not in value_in_data: - text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'." - sys.exit(text) - - #high_cardinality_features - if high_cardinality_features is not None: - if type(high_cardinality_features) is not list: - sys.exit("(Type Error): high_cardinality_features param only accepts name of columns as a list. 
") - - if high_cardinality_features is not None: - data_cols = data.columns - #data_cols = data_cols.drop(target) - for i in high_cardinality_features: - if i not in data_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #checking numeric imputation - allowed_numeric_imputation = ['mean', 'median'] - if numeric_imputation not in allowed_numeric_imputation: - sys.exit("(Value Error): numeric_imputation param only accepts 'mean' or 'median' ") - - #checking normalize method - allowed_normalize_method = ['zscore', 'minmax', 'maxabs', 'robust'] - if normalize_method not in allowed_normalize_method: - sys.exit("(Value Error): normalize_method param only accepts 'zscore', 'minxmax', 'maxabs' or 'robust'. ") - - #checking transformation method - allowed_transformation_method = ['yeo-johnson', 'quantile'] - if transformation_method not in allowed_transformation_method: - sys.exit("(Value Error): transformation_method param only accepts 'yeo-johnson' or 'quantile' ") - - #handle unknown categorical - if type(handle_unknown_categorical) is not bool: - sys.exit('(Type Error): handle_unknown_categorical parameter only accepts True or False.') - - #unknown categorical method - unknown_categorical_method_available = ['least_frequent', 'most_frequent'] - - #forced type check - all_cols = list(data.columns) - - #categorical - if categorical_features is not None: - for i in categorical_features: - if i not in all_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - #numeric - if numeric_features is not None: - for i in numeric_features: - if i not in all_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #date features - if date_features is not None: - for i in date_features: - if i not in all_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #drop features - if ignore_features is not None: - for i in ignore_features: - if i not in all_cols: - sys.exit("(Value Error): Feature ignored is either target column or doesn't exist in the dataset.") - - #check pca - if type(pca) is not bool: - sys.exit('(Type Error): PCA parameter only accepts True or False.') - - #pca method check - allowed_pca_methods = ['linear', 'kernel', 'incremental'] - if pca_method not in allowed_pca_methods: - sys.exit("(Value Error): pca method param only accepts 'linear', 'kernel', or 'incremental'. ") - - #pca components check - if pca is True: - if pca_method is not 'linear': - if pca_components is not None: - if(type(pca_components)) is not int: - sys.exit("(Type Error): pca_components parameter must be integer when pca_method is not 'linear'. 
") - - #pca components check 2 - if pca is True: - if pca_method is not 'linear': - if pca_components is not None: - if pca_components > len(data.columns): - sys.exit("(Type Error): pca_components parameter cannot be greater than original features space.") - - #pca components check 3 - if pca is True: - if pca_method is 'linear': - if pca_components is not None: - if type(pca_components) is not float: - if pca_components > len(data.columns): - sys.exit("(Type Error): pca_components parameter cannot be greater than original features space or float between 0 - 1.") - - #check ignore_low_variance - if type(ignore_low_variance) is not bool: - sys.exit('(Type Error): ignore_low_variance parameter only accepts True or False.') - - #check ignore_low_variance - if type(combine_rare_levels) is not bool: - sys.exit('(Type Error): combine_rare_levels parameter only accepts True or False.') - - #check rare_level_threshold - if type(rare_level_threshold) is not float: - sys.exit('(Type Error): rare_level_threshold must be a float between 0 and 1. ') - - #bin numeric features - if bin_numeric_features is not None: - all_cols = list(data.columns) - - for i in bin_numeric_features: - if i not in all_cols: - sys.exit("(Value Error): Column type forced is either target column or doesn't exist in the dataset.") - - #remove_multicollinearity - if type(remove_multicollinearity) is not bool: - sys.exit('(Type Error): remove_multicollinearity parameter only accepts True or False.') - - #multicollinearity_threshold - if type(multicollinearity_threshold) is not float: - sys.exit('(Type Error): multicollinearity_threshold must be a float between 0 and 1. ') - - #group features - if group_features is not None: - if type(group_features) is not list: - sys.exit('(Type Error): group_features must be of type list. ') - - if group_names is not None: - if type(group_names) is not list: - sys.exit('(Type Error): group_names must be of type list. ') - - #silent - if type(silent) is not bool: - sys.exit("(Type Error): silent parameter only accepts True or False. ") - - #html - if type(html) is not bool: - sys.exit('(Type Error): html parameter only accepts True or False.') - - #log_experiment - if type(log_experiment) is not bool: - sys.exit('(Type Error): log_experiment parameter only accepts True or False.') - - #log_plots - if type(log_plots) is not bool: - sys.exit('(Type Error): log_plots parameter only accepts True or False.') - - #log_data - if type(log_data) is not bool: - sys.exit('(Type Error): log_data parameter only accepts True or False.') - - #log_profile - if type(log_profile) is not bool: - sys.exit('(Type Error): log_profile parameter only accepts True or False.') - - - """ - error handling ends here - """ - - logger.info("Preloading libraries") - #pre-load libraries - import pandas as pd - import ipywidgets as ipw - from IPython.display import display, HTML, clear_output, update_display - import datetime, time - import secrets - - #pandas option - pd.set_option('display.max_columns', 500) - pd.set_option('display.max_rows', 500) - - #global html_param - global html_param - - #create html_param - html_param = html - - logger.info("Preparing display monitor") - - #progress bar - max_steps = 4 - - progress = ipw.IntProgress(value=0, min=0, max=max_steps, step=1 , description='Processing: ') - - - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' 
-                              #['Step' , '. . . . . . . . . . . . . . . . . .', 'Step 0 of ' + str(total_steps)] ],
-                              columns=['', ' ', '   ']).set_index('')
-
-    if verbose:
-        if html_param:
-            display(progress)
-            display(monitor, display_id = 'monitor')
-
-    logger.info("Importing libraries")
-    #general dependencies
-    import numpy as np
-    import pandas as pd
-    import random
-
-    #setting sklearn config to print all parameters including default
-    import sklearn
-    sklearn.set_config(print_changed_only=False)
-
-    #define highlight function for function grid to display
-    def highlight_max(s):
-        is_max = s == True
-        return ['background-color: lightgreen' if v else '' for v in is_max]
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    logger.info("Declaring global variables")
-    #defining global variables
-    global data_, X, seed, prep_pipe, prep_param, experiment__,\
-        n_jobs_param, exp_name_log, logging_param, log_plots_param
-
-    logger.info("Copying data for preprocessing")
-    #copy original data for pandas profiler
-    data_before_preprocess = data.copy()
-
-    #copying data
-    data_ = data.copy()
-
-    #data without target
-    if supervised:
-        data_without_target = data.copy()
-        data_without_target.drop(supervised_target, axis=1, inplace=True)
-
-    if supervised:
-        data_for_preprocess = data_without_target.copy()
-    else:
-        data_for_preprocess = data_.copy()
-
-    #generate seed to be used globally
-    if session_id is None:
-        seed = random.randint(150,9000)
-    else:
-        seed = session_id
-
-    """
-    preprocessing starts here
-    """
-
-    pd.set_option('display.max_columns', 500)
-    pd.set_option('display.max_rows', 500)
-
-    monitor.iloc[1,1:] = 'Preparing Data for Modeling'
-    if verbose:
-        if html_param:
-            update_display(monitor, display_id = 'monitor')
-
-    #define parameters for preprocessor
-
-    logger.info("Declaring preprocessing parameters")
-
-    #categorical features
-    if categorical_features is None:
-        cat_features_pass = []
-    else:
-        cat_features_pass = categorical_features
-
-    #numeric features
-    if numeric_features is None:
-        numeric_features_pass = []
-    else:
-        numeric_features_pass = numeric_features
-
-    #drop features
-    if ignore_features is None:
-        ignore_features_pass = []
-    else:
-        ignore_features_pass = ignore_features
-
-    #date features
-    if date_features is None:
-        date_features_pass = []
-    else:
-        date_features_pass = date_features
-
-    #categorical imputation strategy
-    if categorical_imputation == 'constant':
-        categorical_imputation_pass = 'not_available'
-    elif categorical_imputation == 'mode':
-        categorical_imputation_pass = 'most frequent'
-
-    #transformation method strategy
-    if transformation_method == 'yeo-johnson':
-        trans_method_pass = 'yj'
-    elif transformation_method == 'quantile':
-        trans_method_pass = 'quantile'
-
-    #pass method
-    if pca_method == 'linear':
-        pca_method_pass = 'pca_liner'
-
-    elif pca_method == 'kernel':
-        pca_method_pass = 'pca_kernal'
-
-    elif pca_method == 'incremental':
-        pca_method_pass = 'incremental'
-
-    elif pca_method == 'pls':
-        pca_method_pass = 'pls'
-
-    #pca components
-    if pca is True:
-        if pca_components is None:
-            if pca_method == 'linear':
-                pca_components_pass = 0.99
-            else:
-                pca_components_pass = int((len(data.columns))*0.5)
-
-        else:
-            pca_components_pass = pca_components
-
-    else:
-        pca_components_pass = 0.99
-
-    if bin_numeric_features is None:
-        apply_binning_pass = False
-        features_to_bin_pass = []
-
-    else:
-        apply_binning_pass = True
-        features_to_bin_pass = bin_numeric_features
-
-    #group features
-    #=============#
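-    #Illustrative note (sketch only, nothing executed here): a single flat group
-    #such as group_features = ['Col1', 'Col2', 'Col3'] is wrapped below into the
-    #list-of-lists form [['Col1', 'Col2', 'Col3']] that the preprocessing pipeline
-    #expects; per-group statistics are then extracted, and groups are named
-    #group_1, group_2, ... unless group_names is supplied.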
-
-    #apply grouping
-    if group_features is not None:
-        apply_grouping_pass = True
-    else:
-        apply_grouping_pass = False
-
-    #group features listing
-    if apply_grouping_pass is True:
-
-        if type(group_features[0]) is str:
-            group_features_pass = []
-            group_features_pass.append(group_features)
-        else:
-            group_features_pass = group_features
-
-    else:
-
-        group_features_pass = [[]]
-
-    #group names
-    if apply_grouping_pass is True:
-
-        if (group_names is None) or (len(group_names) != len(group_features_pass)):
-            group_names_pass = list(np.arange(len(group_features_pass)))
-            group_names_pass = ['group_' + str(i) for i in group_names_pass]
-
-        else:
-            group_names_pass = group_names
-
-    else:
-        group_names_pass = []
-
-    #unknown categorical
-    if unknown_categorical_method == 'least_frequent':
-        unknown_categorical_method_pass = 'least frequent'
-    elif unknown_categorical_method == 'most_frequent':
-        unknown_categorical_method_pass = 'most frequent'
-
-    #ordinal_features
-    if ordinal_features is not None:
-        apply_ordinal_encoding_pass = True
-    else:
-        apply_ordinal_encoding_pass = False
-
-    if apply_ordinal_encoding_pass is True:
-        ordinal_columns_and_categories_pass = ordinal_features
-    else:
-        ordinal_columns_and_categories_pass = {}
-
-    #high cardinality
-    if high_cardinality_features is not None:
-        apply_cardinality_reduction_pass = True
-    else:
-        apply_cardinality_reduction_pass = False
-
-    cardinal_method_pass = 'count'
-
-    if apply_cardinality_reduction_pass:
-        cardinal_features_pass = high_cardinality_features
-    else:
-        cardinal_features_pass = []
-
-    #display dtypes
-    if supervised is False:
-        display_types_pass = True
-    else:
-        display_types_pass = False
-
-    if silent:
-        display_types_pass = False
-
-    logger.info("Importing preprocessing module")
-
-    #import library
-    from pycaret import preprocess
-
-    logger.info("Creating preprocessing pipeline")
-
-    X = preprocess.Preprocess_Path_Two(train_data = data_for_preprocess,
-                                       categorical_features = cat_features_pass,
-                                       apply_ordinal_encoding = apply_ordinal_encoding_pass, #new
-                                       ordinal_columns_and_categories = ordinal_columns_and_categories_pass,
-                                       apply_cardinality_reduction = apply_cardinality_reduction_pass, #latest
-                                       cardinal_method = cardinal_method_pass, #latest
-                                       cardinal_features = cardinal_features_pass, #latest
-                                       numerical_features = numeric_features_pass,
-                                       time_features = date_features_pass,
-                                       features_todrop = ignore_features_pass,
-                                       display_types = display_types_pass,
-                                       numeric_imputation_strategy = numeric_imputation,
-                                       categorical_imputation_strategy = categorical_imputation_pass,
-                                       scale_data = normalize,
-                                       scaling_method = normalize_method,
-                                       Power_transform_data = transformation,
-                                       Power_transform_method = trans_method_pass,
-                                       apply_untrained_levels_treatment= handle_unknown_categorical, #new
-                                       untrained_levels_treatment_method = unknown_categorical_method_pass, #new
-                                       apply_pca = pca,
-                                       pca_method = pca_method_pass, #new
-                                       pca_variance_retained_or_number_of_components = pca_components_pass, #new
-                                       apply_zero_nearZero_variance = ignore_low_variance, #new
-                                       club_rare_levels = combine_rare_levels, #new
-                                       rara_level_threshold_percentage = rare_level_threshold, #new
-                                       apply_binning = apply_binning_pass, #new
-                                       features_to_binn = features_to_bin_pass, #new
-                                       remove_multicollinearity = remove_multicollinearity, #new
-                                       maximum_correlation_between_features = multicollinearity_threshold, #new
-                                       apply_grouping = apply_grouping_pass, #new
-                                       features_to_group_ListofList = group_features_pass, #new
-                                       group_name = group_names_pass, #new
-                                       random_state = seed)
-
-    progress.value += 1
-    logger.info("Preprocessing pipeline created successfully")
-
-    try:
-        res_type = ['quit','Quit','exit','EXIT','q','Q','e','E','QUIT','Exit']
-        res = preprocess.dtypes.response
-        if res in res_type:
-            sys.exit("(Process Exit): setup has been interrupted with user command 'quit'. setup must be rerun." )
-    except:
-        pass
-
-    #save prep pipe
-    prep_pipe = preprocess.pipe
-    prep_param = preprocess
-
-    logger.info("Creating grid variables")
-
-    #generate values for grid show
-    missing_values = data_before_preprocess.isna().sum().sum()
-    if missing_values > 0:
-        missing_flag = True
-    else:
-        missing_flag = False
-
-    if normalize is True:
-        normalize_grid = normalize_method
-    else:
-        normalize_grid = 'None'
-
-    if transformation is True:
-        transformation_grid = transformation_method
-    else:
-        transformation_grid = 'None'
-
-    if pca is True:
-        pca_method_grid = pca_method
-    else:
-        pca_method_grid = 'None'
-
-    if pca is True:
-        pca_components_grid = pca_components_pass
-    else:
-        pca_components_grid = 'None'
-
-    if combine_rare_levels:
-        rare_level_threshold_grid = rare_level_threshold
-    else:
-        rare_level_threshold_grid = 'None'
-
-    if bin_numeric_features is None:
-        numeric_bin_grid = False
-    else:
-        numeric_bin_grid = True
-
-    if ordinal_features is not None:
-        ordinal_features_grid = True
-    else:
-        ordinal_features_grid = False
-
-    if remove_multicollinearity is False:
-        multicollinearity_threshold_grid = None
-    else:
-        multicollinearity_threshold_grid = multicollinearity_threshold
-
-    if group_features is not None:
-        group_features_grid = True
-    else:
-        group_features_grid = False
-
-    if high_cardinality_features is not None:
-        high_cardinality_features_grid = True
-    else:
-        high_cardinality_features_grid = False
-
-    learned_types = preprocess.dtypes.learent_dtypes
-    #learned_types.drop(target, inplace=True)
-
-    float_type = 0
-    cat_type = 0
-
-    for i in preprocess.dtypes.learent_dtypes:
-        if 'float' in str(i):
-            float_type += 1
-        elif 'object' in str(i):
-            cat_type += 1
-        elif 'int' in str(i):
-            float_type += 1
-
-    """
-    preprocessing ends here
-    """
-
-    #reset pandas option
-    pd.reset_option("display.max_rows")
-    pd.reset_option("display.max_columns")
-
-    logger.info("Creating global containers")
-
-    #create an empty list for pickling later.
-    if supervised is False:
-        experiment__ = []
-    else:
-        try:
-            experiment__.append('dummy')
-            experiment__.remove('dummy')
-        except:
-            experiment__ = []
-
-    #create n_jobs_param
-    n_jobs_param = n_jobs
-
-    #create logging parameter
-    logging_param = log_experiment
-
-    #create exp_name_log param in case logging is False
-    exp_name_log = 'no_logging'
-
-    #create an empty log_plots_param
-    if log_plots:
-        log_plots_param = True
-    else:
-        log_plots_param = False
-
-    progress.value += 1
-
-    #monitor update
-    monitor.iloc[1,1:] = 'Compiling Results'
-    if verbose:
-        if html_param:
-            update_display(monitor, display_id = 'monitor')
-
-    '''
-    Final display Starts
-    '''
-
-    shape = data.shape
-    shape_transformed = X.shape
-
-    if profile:
-        if verbose:
-            print('Setup Successfully Completed! Loading Profile Now... Please Wait!')
-    else:
-        if verbose:
-            print('Setup Successfully Completed!')
-
-    functions = pd.DataFrame ( [ ['session_id ', seed ],
-                                 ['Original Data ', shape ],
-                                 ['Missing Values ', missing_flag],
-                                 ['Numeric Features ', str(float_type-1) ],
-                                 ['Categorical Features ', str(cat_type) ],
-                                 ['Ordinal Features ', ordinal_features_grid],
-                                 ['High Cardinality Features ', high_cardinality_features_grid],
-                                 ['Transformed Data ', shape_transformed ],
-                                 ['Numeric Imputer ', numeric_imputation],
-                                 ['Categorical Imputer ', categorical_imputation],
-                                 ['Normalize ', normalize ],
-                                 ['Normalize Method ', normalize_grid ],
-                                 ['Transformation ', transformation ],
-                                 ['Transformation Method ', transformation_grid ],
-                                 ['PCA ', pca],
-                                 ['PCA Method ', pca_method_grid],
-                                 ['PCA components ', pca_components_grid],
-                                 ['Ignore Low Variance ', ignore_low_variance],
-                                 ['Combine Rare Levels ', combine_rare_levels],
-                                 ['Rare Level Threshold ', rare_level_threshold_grid],
-                                 ['Numeric Binning ', numeric_bin_grid],
-                                 ['Remove Multicollinearity ', remove_multicollinearity],
-                                 ['Multicollinearity Threshold ', multicollinearity_threshold_grid],
-                                 ['Group Features ', group_features_grid],
-                               ], columns = ['Description', 'Value'] )
-
-    functions_ = functions.style.apply(highlight_max)
-
-    progress.value += 1
-
-    if verbose:
-        if html_param:
-            clear_output()
-            print('Setup Successfully Completed!')
-            display(functions_)
-        else:
-            print(functions_.data)
-
-    if profile:
-        try:
-            import pandas_profiling
-            pf = pandas_profiling.ProfileReport(data_before_preprocess)
-            clear_output()
-            display(pf)
-        except:
-            print('Data Profiler Failed. No output to show, please continue with Modeling.')
-
-    '''
-    Final display Ends
-    '''
-
-    #log into experiment
-    if verbose:
-        experiment__.append(('Anomaly Setup Config', functions))
-        experiment__.append(('Original Dataset', data_))
-        experiment__.append(('Transformed Dataset', X))
-        experiment__.append(('Transformation Pipeline', prep_pipe))
-
-    #end runtime
-    runtime_end = time.time()
-    runtime = np.array(runtime_end - runtime_start).round(2)
-
-    if logging_param:
-
-        logger.info("Logging experiment in MLFlow")
-
-        import mlflow
-        from pathlib import Path
-        import os
-
-        if experiment_name is None:
-            exp_name_ = 'ano-default-name'
-        else:
-            exp_name_ = experiment_name
-
-        URI = secrets.token_hex(nbytes=4)
-        exp_name_log = exp_name_
-
-        try:
-            mlflow.create_experiment(exp_name_log)
-        except:
-            pass
-
-        #mlflow logging
-        mlflow.set_experiment(exp_name_log)
-
-        run_name_ = 'Session Initialized ' + str(USI)
-        with mlflow.start_run(run_name=run_name_) as run:
-
-            # Get active run to log as tag
-            RunID = mlflow.active_run().info.run_id
-
-            k = functions.copy()
-            k.set_index('Description',drop=True,inplace=True)
-            kdict = k.to_dict()
-            params = kdict.get('Value')
-            mlflow.log_params(params)
-
-            #set tag of setup
-            mlflow.set_tag("Source", "setup")
-
-            import secrets
-            URI = secrets.token_hex(nbytes=4)
-            mlflow.set_tag("URI", URI)
-
-            mlflow.set_tag("USI", USI)
-
-            mlflow.set_tag("Run Time", runtime)
-
-            mlflow.set_tag("Run ID", RunID)
-
-            # Log the transformation pipeline
-            logger.info("SubProcess save_model() called ==================================")
-            save_model(prep_pipe, 'Transformation Pipeline', verbose=False)
-            logger.info("SubProcess save_model() end ==================================")
-            mlflow.log_artifact('Transformation Pipeline' + '.pkl')
-            size_bytes = Path('Transformation Pipeline.pkl').stat().st_size
-            size_kb = np.round(size_bytes/1000, 2)
-            mlflow.set_tag("Size KB", size_kb)
-            os.remove('Transformation Pipeline.pkl')
-
-            # Log pandas profile
-            if log_profile:
-                import pandas_profiling
-                pf = pandas_profiling.ProfileReport(data_before_preprocess)
-                pf.to_file("Data Profile.html")
-                mlflow.log_artifact("Data Profile.html")
-                os.remove("Data Profile.html")
-                clear_output()
-                display(functions_)
-
-            # Log training and testing set
-            if log_data:
-                data_before_preprocess.to_csv('data.csv')
-                mlflow.log_artifact('data.csv')
-                os.remove('data.csv')
-
-            # Log input.txt that contains the names of the columns required in the
-            # dataset to use this pipeline based on USI/URI.
-
-            input_cols = list(data_before_preprocess.columns)
-
-            with open("input.txt", "w") as output:
-                output.write(str(input_cols))
-
-            mlflow.log_artifact("input.txt")
-            os.remove('input.txt')
-
-    logger.info(str(prep_pipe))
-    logger.info("setup() successfully completed......................................")
-
-    return X, data_, seed, prep_pipe, prep_param, experiment__,\
-        n_jobs_param, html_param, exp_name_log, logging_param, log_plots_param, USI
-
-def create_model(model = None,
-                 fraction = 0.05,
-                 verbose = True,
-                 system=True, #added in pycaret==2.0.0
-                 **kwargs): #added in pycaret==2.0.0
-
-    """
-
-    Description:
-    ------------
-    This function creates a model on the dataset passed as a data param during
-    the setup stage. setup() function must be called before using create_model().
-
-    This function returns a trained model object.
-
-    Example
-    -------
-    from pycaret.datasets import get_data
-    anomaly = get_data('anomaly')
-    experiment_name = setup(data = anomaly, normalize = True)
-
-    knn = create_model('knn')
-
-    This will return a trained k-Nearest Neighbors model.
-
-    Parameters
-    ----------
-    model : string / object, default = None
-
-    Enter ID of the models available in model library or pass an untrained model
-    object consistent with fit / predict API to train and evaluate model. List of
-    models available in model library:
-
-    ID          Model
-    -------     ---------
-    'abod'      Angle-base Outlier Detection
-    'cluster'   Clustering-Based Local Outlier
-    'cof'       Connectivity-Based Outlier Factor
-    'histogram' Histogram-based Outlier Detection
-    'iforest'   Isolation Forest
-    'knn'       k-Nearest Neighbors Detector
-    'lof'       Local Outlier Factor
-    'svm'       One-class SVM detector
-    'pca'       Principal Component Analysis
-    'mcd'       Minimum Covariance Determinant
-    'sod'       Subspace Outlier Detection
-    'sos'       Stochastic Outlier Selection
-
-    fraction: float, default = 0.05
-    The percentage / proportion of outliers in the dataset.
-
-    verbose: Boolean, default = True
-    Status update is not printed when verbose is set to False.
-
-    system: Boolean, default = True
-    Must remain True at all times. Only to be changed by internal functions.
-
-    **kwargs:
-    Additional keyword arguments to pass to the estimator.
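-
-    As an illustrative sketch of the **kwargs pass-through (n_neighbors is an
-    argument of pyod's KNN detector; the value is arbitrary):
-
-    knn = create_model('knn', fraction = 0.1, n_neighbors = 10)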
-
-    Returns:
-    --------
-
-    model:    trained model object
-    ------
-
-    """
-
-    import logging
-
-    try:
-        hasattr(logger, 'name')
-    except:
-        logger = logging.getLogger('logs')
-        logger.setLevel(logging.DEBUG)
-
-        # create console handler and set level to debug
-        if logger.hasHandlers():
-            logger.handlers.clear()
-
-        ch = logging.FileHandler('logs.log')
-        ch.setLevel(logging.DEBUG)
-
-        # create formatter
-        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-
-        # add formatter to ch
-        ch.setFormatter(formatter)
-
-        # add ch to logger
-        logger.addHandler(ch)
-
-    try:
-        logger.info("Initializing create_model()")
-        logger.info("""create_model(model={}, fraction={}, verbose={}, system={})""".\
-            format(str(model), str(fraction), str(verbose), str(system)))
-
-        logger.info("Checking exceptions")
-
-    except:
-        logger = logging.getLogger('logs')
-        logger.setLevel(logging.DEBUG)
-
-        # create console handler and set level to debug
-        if logger.hasHandlers():
-            logger.handlers.clear()
-
-        ch = logging.FileHandler('logs.log')
-        ch.setLevel(logging.DEBUG)
-
-        # create formatter
-        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-
-        # add formatter to ch
-        ch.setFormatter(formatter)
-
-        # add ch to logger
-        logger.addHandler(ch)
-
-        logger.info("Initializing create_model()")
-        logger.info("Checking exceptions")
-
-    #exception checking
-    import sys
-
-    #run_time
-    import datetime, time
-    runtime_start = time.time()
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    """
-    error handling starts here
-    """
-
-    #checking for model parameter
-    if model is None:
-        sys.exit('(Value Error): Model parameter Missing. Please see docstring for list of available models.')
-
-    #checking for allowed models
-    allowed_models = ['abod', 'iforest', 'cluster', 'cof', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
-
-    if model not in allowed_models:
-        sys.exit('(Value Error): Model Not Available. Please see docstring for list of available models.')
-
-    #checking fraction type:
-    if fraction <= 0 or fraction >= 1:
-        sys.exit('(Type Error): Fraction parameter can only take value as float between 0 and 1.')
-
-    #checking verbose parameter
-    if type(verbose) is not bool:
-        sys.exit('(Type Error): Verbose parameter can only take argument as True or False.')
-
-    """
-    error handling ends here
-    """
-
-    logger.info("Preloading libraries")
-
-    #pre-load libraries
-    import pandas as pd
-    import numpy as np
-    import ipywidgets as ipw
-    from IPython.display import display, HTML, clear_output, update_display
-    import datetime, time
-
-    """
-    monitor starts
-    """
-
-    logger.info("Preparing display monitor")
-
-    #progress bar and monitor control
-    timestampStr = datetime.datetime.now().strftime("%H:%M:%S")
-    progress = ipw.IntProgress(value=0, min=0, max=4, step=1, description='Processing: ')
-    monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ],
-                              ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Initializing'] ],
-                              columns=['', ' ', '   ']).set_index('')
-    if verbose:
-        if html_param:
-            display(progress)
-            display(monitor, display_id = 'monitor')
-
-    progress.value += 1
-
-    """
-    monitor ends
-    """
-
-    #monitor update
-    monitor.iloc[1,1:] = 'Importing the Model'
-    if verbose:
-        if html_param:
-            update_display(monitor, display_id = 'monitor')
-
-    progress.value += 1
-
-    #create model
-    logger.info("Importing untrained model")
-
-    if model == 'abod':
-        from pyod.models.abod import ABOD
-        model = ABOD(contamination=fraction, **kwargs)
-        full_name = 'Angle-base Outlier Detection'
-
-    elif model == 'cluster':
-        from pyod.models.cblof import CBLOF
-        try:
-            model = CBLOF(contamination=fraction, n_clusters=8, random_state=seed, **kwargs)
-            model.fit(X)
-        except:
-            try:
-                model = CBLOF(contamination=fraction, n_clusters=12, random_state=seed, **kwargs)
-                model.fit(X)
-            except:
-                sys.exit("(Type Error): Could not form valid cluster separation")
-
-        full_name = 'Clustering-Based Local Outlier'
-
-    elif model == 'cof':
-        from pyod.models.cof import COF
-        model = COF(contamination=fraction, **kwargs)
-        full_name = 'Connectivity-Based Outlier Factor'
-
-    elif model == 'iforest':
-        from pyod.models.iforest import IForest
-        model = IForest(contamination=fraction, behaviour = 'new', random_state=seed, **kwargs)
-        full_name = 'Isolation Forest'
-
-    elif model == 'histogram':
-        from pyod.models.hbos import HBOS
-        model = HBOS(contamination=fraction, **kwargs)
-        full_name = 'Histogram-based Outlier Detection'
-
-    elif model == 'knn':
-        from pyod.models.knn import KNN
-        model = KNN(contamination=fraction, **kwargs)
-        full_name = 'k-Nearest Neighbors Detector'
-
-    elif model == 'lof':
-        from pyod.models.lof import LOF
-        model = LOF(contamination=fraction, **kwargs)
-        full_name = 'Local Outlier Factor'
-
-    elif model == 'svm':
-        from pyod.models.ocsvm import OCSVM
-        model = OCSVM(contamination=fraction, **kwargs)
-        full_name = 'One-class SVM detector'
-
-    elif model == 'pca':
-        from pyod.models.pca import PCA
-        model = PCA(contamination=fraction, random_state=seed, **kwargs)
-        full_name = 'Principal Component Analysis'
-
-    elif model == 'mcd':
-        from pyod.models.mcd import MCD
-        model = MCD(contamination=fraction, random_state=seed, **kwargs)
-        full_name = 'Minimum Covariance Determinant'
-
-    elif model == 'sod':
-        from pyod.models.sod import SOD
-        model = SOD(contamination=fraction, **kwargs)
-        full_name = 'Subspace Outlier Detection'
-
-    elif model == 'sos':
-        from pyod.models.sos import SOS
-        model = SOS(contamination=fraction, **kwargs)
-        full_name = 'Stochastic Outlier Selection'
-
-    else:
-        def get_model_name(e):
-            return str(e).split("(")[0]
-
-        full_name = get_model_name(model)
-
-    logger.info(str(full_name) + ' Imported successfully')
-
-    #monitor update
-    monitor.iloc[1,1:] = 'Fitting the Model'
-    progress.value += 1
-    if verbose:
-        if html_param:
-            update_display(monitor, display_id = 'monitor')
-
-    #fitting the model
-    model_fit_start = time.time()
-    logger.info("Fitting Model")
-    model.fit(X)
-    model_fit_end = time.time()
-
-    model_fit_time = np.array(model_fit_end - model_fit_start).round(2)
-
-    #end runtime
-    runtime_end = time.time()
-    runtime = np.array(runtime_end - runtime_start).round(2)
-
-    #mlflow logging
-    if logging_param and system:
-
-        logger.info("Creating MLFlow logs")
-
-        #Creating Logs message monitor
-        monitor.iloc[1,1:] = 'Creating Logs'
-        if verbose:
-            if html_param:
-                update_display(monitor, display_id = 'monitor')
-
-        #import mlflow
-        import mlflow
-        from pathlib import Path
-        import os
-
-        mlflow.set_experiment(exp_name_log)
-
-        with mlflow.start_run(run_name=full_name) as run:
-
-            # Get active run to log as tag
-            RunID = mlflow.active_run().info.run_id
-
-            # Log model parameters
-            params = model.get_params()
-
-            for i in list(params):
-                v = params.get(i)
-                if len(str(v)) > 250:
-                    params.pop(i)
-
-            mlflow.log_params(params)
-
-            #set tag of create_model
-            mlflow.set_tag("Source", "create_model")
-
-            import secrets
-            URI = secrets.token_hex(nbytes=4)
-            mlflow.set_tag("URI", URI)
-            mlflow.set_tag("USI", USI)
-            mlflow.set_tag("Run Time", runtime)
-            mlflow.set_tag("Run ID", RunID)
-
-            # Log training time in seconds
-            mlflow.log_metric("TT", model_fit_time)
-
-            # Log t-SNE plot
-            if log_plots_param:
-
-                logger.info("SubProcess plot_model() called ==================================")
-
-                try:
-                    plot_model(model, plot = 'tsne', save=True, system=False)
-                    mlflow.log_artifact('TSNE.html')
-                    os.remove("TSNE.html")
-                except:
-                    pass
-
-                logger.info("SubProcess plot_model() end ==================================")
-
-            # Log model and transformation pipeline
-            logger.info("SubProcess save_model() called ==================================")
-            save_model(model, 'Trained Model', verbose=False)
-            logger.info("SubProcess save_model() end ==================================")
-            mlflow.log_artifact('Trained Model' + '.pkl')
-            size_bytes = Path('Trained Model.pkl').stat().st_size
-            size_kb = np.round(size_bytes/1000, 2)
-            mlflow.set_tag("Size KB", size_kb)
-            os.remove('Trained Model.pkl')
-
-    progress.value += 1
-
-    if verbose:
-        clear_output()
-
-    logger.info(str(model))
-    logger.info("create_model() successfully completed......................................")
-
-    return model
-
-def assign_model(model,
-                 transformation=False,
-                 score=True,
-                 verbose=True):
-
-    """
-
-    Description:
-    ------------
-    This function flags each data point in the dataset passed during the setup
-    stage as either an outlier or an inlier (1 = outlier, 0 = inlier) using the
-    trained model object passed as the model param. create_model() function must
-    be called before using assign_model().
-
-    This function returns a dataframe with the outlier flag (1 = outlier, 0 = inlier)
-    and the decision score, when score is set to True.
-
-    Example
-    -------
-    from pycaret.datasets import get_data
-    anomaly = get_data('anomaly')
-    experiment_name = setup(data = anomaly, normalize = True)
-    knn = create_model('knn')
-
-    knn_df = assign_model(knn)
-
-    This will return a dataframe with inferred outliers using a trained model.
-
-    Parameters
-    ----------
-    model : trained model object, default = None
-
-    transformation: bool, default = False
-    When set to True, assigned outliers are returned on the transformed dataset instead
-    of the original dataset passed during setup().
-
-    score: Boolean, default = True
-    The outlier scores of the training data. The higher the score, the more abnormal the
-    observation. Outliers tend to have higher scores. This value is available once the model
-    is fitted. If set to False, it will only return the flag (1 = outlier, 0 = inlier).
-
-    verbose: Boolean, default = True
-    Status update is not printed when verbose is set to False.
-
-    Returns:
-    --------
-
-    dataframe:   Returns a dataframe with inferred outliers using a trained model.
-    ---------
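-
-    A typical follow-up, as a sketch (the 'Label' and 'Score' columns are the
-    ones this function appends):
-
-    knn_df = assign_model(knn)
-    outliers = knn_df[knn_df['Label'] == 1].sort_values('Score', ascending = False)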
-
-    """
-
-    #exception checking
-    import sys
-
-    import logging
-
-    try:
-        hasattr(logger, 'name')
-    except:
-        logger = logging.getLogger('logs')
-        logger.setLevel(logging.DEBUG)
-
-        # create console handler and set level to debug
-        if logger.hasHandlers():
-            logger.handlers.clear()
-
-        ch = logging.FileHandler('logs.log')
-        ch.setLevel(logging.DEBUG)
-
-        # create formatter
-        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-
-        # add formatter to ch
-        ch.setFormatter(formatter)
-
-        # add ch to logger
-        logger.addHandler(ch)
-
-    logger.info("Initializing assign_model()")
-    logger.info("""assign_model(model={}, transformation={}, score={}, verbose={})""".\
-        format(str(model), str(transformation), str(score), str(verbose)))
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    """
-    error handling starts here
-    """
-
-    #determine model type and store in string
-    mod_type = str(type(model))
-
-    #checking for allowed models
-    allowed_type = ['pyod']
-    if 'pyod' not in mod_type:
-        sys.exit('(Value Error): Model Not Recognized. Please see docstring for list of available models.')
-
-    #checking transformation parameter
-    if type(transformation) is not bool:
-        sys.exit('(Type Error): Transformation parameter can only take argument as True or False.')
-
-    #checking score parameter
-    if type(score) is not bool:
-        sys.exit('(Type Error): Score parameter can only take argument as True or False.')
-
-    #checking verbose parameter
-    if type(verbose) is not bool:
-        sys.exit('(Type Error): Verbose parameter can only take argument as True or False.')
-
-    """
-    error handling ends here
-    """
-
-    logger.info("Preloading libraries")
-    #pre-load libraries
-    import numpy as np
-    import pandas as pd
-    import ipywidgets as ipw
-    from IPython.display import display, HTML, clear_output, update_display
-    import datetime, time
-
-    logger.info("Copying data")
-    #copy data_
-    if transformation:
-        data__ = X.copy()
-    else:
-        data__ = data_.copy()
-
-    logger.info("Preparing display monitor")
-    #progress bar and monitor control
-    timestampStr = datetime.datetime.now().strftime("%H:%M:%S")
-    progress = ipw.IntProgress(value=0, min=0, max=3, step=1, description='Processing: ')
-    monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ],
-                              ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Initializing'] ],
-                              columns=['', ' ', '   ']).set_index('')
-    if verbose:
-        if html_param:
-            display(progress)
-            display(monitor, display_id = 'monitor')
-
-    progress.value += 1
-
-    monitor.iloc[1,1:] = 'Inferring Outliers from Model'
-
-    if verbose:
-        if html_param:
-            update_display(monitor, display_id = 'monitor')
-
-    progress.value += 1
-
-    #calculating labels and attaching to dataframe
-    pred_labels = model.labels_
-    data__['Label'] = pred_labels
-
-    progress.value += 1
-
-    #calculating score and attaching to dataframe
-    if score:
-        pred_score = model.decision_scores_
-        data__['Score'] = pred_score
-
-    progress.value += 1
-
-    logger.info("Determining Trained Model")
-
-    mod_type = str(model).split("(")[0]
-
-    if 'ABOD' in mod_type:
-        name_ = 'Angle-base Outlier Detection'
-
-    elif 'IForest' in mod_type:
-        name_ = 'Isolation Forest'
-
-    elif 'CBLOF' in mod_type:
-        name_ = 'Clustering-Based Local Outlier'
-
-    elif 'COF' in mod_type:
-        name_ = 'Connectivity-Based Outlier Factor'
-
-    elif 'HBOS' in mod_type:
-        name_ = 'Histogram-based Outlier Detection'
-
-    elif 'KNN' in mod_type:
-        name_ = 'k-Nearest Neighbors Detector'
-
-    elif 'LOF' in mod_type:
-        name_ = 'Local Outlier Factor'
-
-    elif 'OCSVM' in mod_type:
-        name_ = 'One-class SVM detector'
-
-    elif 'PCA' in mod_type:
-        name_ = 'Principal Component Analysis'
-
-    elif 'MCD' in mod_type:
-        name_ = 'Minimum Covariance Determinant'
-
-    elif 'SOD' in mod_type:
-        name_ = 'Subspace Outlier Detection'
-
-    elif 'SOS' in mod_type:
-        name_ = 'Stochastic Outlier Selection'
-
-    else:
-        name_ = 'Unknown Anomaly Detector'
-
-    name_ = 'Assigned ' + str(name_)
-
-    logger.info("Trained Model : " + str(name_))
-
-    if verbose:
-        clear_output()
-
-    logger.info(str(data__.shape))
-    logger.info("assign_model() successfully completed......................................")
-
-    return data__
-
-def tune_model(model=None,
-               supervised_target=None,
-               method='drop',
-               estimator=None,
-               optimize=None,
-               custom_grid = None, #added in pycaret 2.0.0
-               fold=10,
-               verbose=True): #added in pycaret 2.0.0
-
-    """
-
-    Description:
-    ------------
-    This function tunes the fraction parameter using a predefined grid with
-    the objective of optimizing a supervised learning metric as defined in
-    the optimize param. You can choose the supervised estimator from a large
-    library available in pycaret. By default, the supervised estimator is Linear.
-
-    This function returns the tuned model object.
-
-    Example
-    -------
-    from pycaret.datasets import get_data
-    boston = get_data('boston')
-    experiment_name = setup(data = boston, normalize = True)
-
-    tuned_knn = tune_model(model = 'knn', supervised_target = 'medv')
-
-    This will return a tuned k-Nearest Neighbors model.
-
-    Parameters
-    ----------
-    model : string, default = None
-
-    Enter ID of the models available in model library:
-
-    ID          Model
-    -------     ---------
-    'abod'      Angle-base Outlier Detection
-    'cluster'   Clustering-Based Local Outlier
-    'cof'       Connectivity-Based Outlier Factor
-    'histogram' Histogram-based Outlier Detection
-    'iforest'   Isolation Forest
-    'knn'       k-Nearest Neighbors Detector
-    'lof'       Local Outlier Factor
-    'svm'       One-class SVM detector
-    'pca'       Principal Component Analysis
-    'mcd'       Minimum Covariance Determinant
-    'sod'       Subspace Outlier Detection
-    'sos'       Stochastic Outlier Selection
-
-    supervised_target: string
-    Name of the target column for supervised learning.
-
-    method: string, default = 'drop'
-    When method is set to 'drop', outlier rows are dropped from the training
-    dataset of the supervised estimator. When set to 'surrogate', the decision
-    function and label are used as features, without dropping the outliers
-    from the training dataset.
-
-    estimator: string, default = None
-
-    ID          Name                              Task
-    --------    ----------                        ----------
-    'lr'        Logistic Regression               Classification
-    'knn'       K Nearest Neighbour               Classification
-    'nb'        Naive Bayes                       Classification
-    'dt'        Decision Tree Classifier          Classification
-    'svm'       SVM - Linear Kernel               Classification
-    'rbfsvm'    SVM - Radial Kernel               Classification
-    'gpc'       Gaussian Process Classifier       Classification
-    'mlp'       Multi Level Perceptron            Classification
-    'ridge'     Ridge Classifier                  Classification
-    'rf'        Random Forest Classifier          Classification
-    'qda'       Quadratic Discriminant Analysis   Classification
-    'ada'       Ada Boost Classifier              Classification
-    'gbc'       Gradient Boosting Classifier      Classification
-    'lda'       Linear Discriminant Analysis      Classification
-    'et'        Extra Trees Classifier            Classification
-    'xgboost'   Extreme Gradient Boosting         Classification
-    'lightgbm'  Light Gradient Boosting           Classification
-    'catboost'  CatBoost Classifier               Classification
-    'lr'        Linear Regression                 Regression
-    'lasso'     Lasso Regression                  Regression
-    'ridge'     Ridge Regression                  Regression
-    'en'        Elastic Net                       Regression
-    'lar'       Least Angle Regression            Regression
-    'llar'      Lasso Least Angle Regression      Regression
-    'omp'       Orthogonal Matching Pursuit       Regression
-    'br'        Bayesian Ridge                    Regression
-    'ard'       Automatic Relevance Determ.       Regression
-    'par'       Passive Aggressive Regressor      Regression
-    'ransac'    Random Sample Consensus           Regression
-    'tr'        TheilSen Regressor                Regression
-    'huber'     Huber Regressor                   Regression
-    'kr'        Kernel Ridge                      Regression
-    'svm'       Support Vector Machine            Regression
-    'knn'       K Neighbors Regressor             Regression
-    'dt'        Decision Tree                     Regression
-    'rf'        Random Forest                     Regression
-    'et'        Extra Trees Regressor             Regression
-    'ada'       AdaBoost Regressor                Regression
-    'gbr'       Gradient Boosting                 Regression
-    'mlp'       Multi Level Perceptron            Regression
-    'xgboost'   Extreme Gradient Boosting         Regression
-    'lightgbm'  Light Gradient Boosting           Regression
-    'catboost'  CatBoost Regressor                Regression
-
-    If set to None, Linear model is used by default for both classification
-    and regression tasks.
-
-    optimize: string, default = None
-    Metric against which the tuning grid is evaluated.
-
-    For Classification tasks:
-    Accuracy, AUC, Recall, Precision, F1, Kappa
-
-    For Regression tasks:
-    MAE, MSE, RMSE, R2, RMSLE, MAPE
-
-    If set to None, default is 'Accuracy' for classification and 'R2' for
-    regression tasks.
-
-    custom_grid: list, default = None
-    By default, a pre-defined list of fraction values is iterated over to
-    optimize the supervised objective. To overwrite the default iteration,
-    pass a list of fraction values to iterate over in the custom_grid param.
-
-    fold: integer, default = 10
-    Number of folds to be used in Kfold CV. Must be at least 2.
-
-    verbose: Boolean, default = True
-    Status update is not printed when verbose is set to False.
-
-    Returns:
-    --------
-
-    visual plot: Visual plot with fraction param on x-axis with metric to
-    -----------  optimize on y-axis. Also, prints the best model metric.
-
-    model: trained model object with best fraction param.
- ----------- - - - """ - - - - """ - exception handling starts here - """ - - global data_, X - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing tune_model()") - logger.info("""tune_model(model={}, supervised_target={}, method={}, estimator={}, optimize={}, custom_grid={}, fold={}, verbose={})""".\ - format(str(model), str(supervised_target), str(method), str(estimator), str(optimize), str(custom_grid), str(fold), str(verbose))) - - logger.info("Checking exceptions") - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - import sys - - #run_time - import datetime, time - runtime_start = time.time() - - #checking for model parameter - if model is None: - sys.exit('(Value Error): Model parameter Missing. Please see docstring for list of available models.') - - #checking for allowed models - allowed_models = ['abod', 'iforest', 'cluster', 'cof', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos'] - - if model not in allowed_models: - sys.exit('(Value Error): Model Not Available for Tuning. Please see docstring for list of available models.') - - #check method - allowed_methods = ['drop', 'surrogate'] - if method not in allowed_methods: - sys.exit('(Value Error): Method not recognized. See docstring for list of available methods.') - - #check if supervised target is None: - if supervised_target is None: - sys.exit('(Value Error): supervised_target cannot be None. A column name must be given for estimator.') - - #check supervised target - if supervised_target is not None: - all_col = list(data_.columns) - if supervised_target not in all_col: - sys.exit('(Value Error): supervised_target not recognized. It can only be one of the following: ' + str(all_col)) - - #checking estimator: - if estimator is not None: - - available_estimators = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', - 'gbc', 'lda', 'et', 'lasso', 'ridge', 'en', 'lar', 'llar', 'omp', 'br', 'ard', 'par', - 'ransac', 'tr', 'huber', 'kr', 'svm', 'knn', 'dt', 'rf', 'et', 'ada', 'gbr', - 'mlp', 'xgboost', 'lightgbm', 'catboost'] - - if estimator not in available_estimators: - sys.exit('(Value Error): Estimator Not Available. Please see docstring for list of available estimators.') - - - #checking optimize parameter - if optimize is not None: - - available_optimizers = ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE', 'Accuracy', 'AUC', 'Recall', 'Precision', 'F1', 'Kappa'] - - if optimize not in available_optimizers: - sys.exit('(Value Error): optimize parameter Not Available. 
Please see docstring for list of available parameters.') - - #checking fold parameter - if type(fold) is not int: - sys.exit('(Type Error): Fold parameter only accepts integer value.') - - - """ - exception handling ends here - """ - - logger.info("Preloading libraries") - - #pre-load libraries - import pandas as pd - import ipywidgets as ipw - from ipywidgets import Output - from IPython.display import display, HTML, clear_output, update_display - import datetime, time - - logger.info("Preparing display monitor") - - #progress bar - if custom_grid is None: - max_steps = 25 - else: - max_steps = 15 + len(custom_grid) - - progress = ipw.IntProgress(value=0, min=0, max=max_steps, step=1 , description='Processing: ') - - if verbose: - if html_param: - display(progress) - - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - - monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ], - ['Status' , '. . . . . . . . . . . . . . . . . .' , 'Loading Dependencies'], - ['Step' , '. . . . . . . . . . . . . . . . . .', 'Initializing' ] ], - columns=['', ' ', ' ']).set_index('') - - monitor_out = Output() - - if verbose: - if html_param: - display(monitor_out) - with monitor_out: - display(monitor, display_id = 'monitor') - - logger.info("Importing libraries") - - #General Dependencies - from sklearn.linear_model import LogisticRegression - from sklearn.model_selection import cross_val_predict - from sklearn import metrics - import numpy as np - import plotly.express as px - from copy import deepcopy - from sklearn.preprocessing import StandardScaler - scaler = StandardScaler() - - logger.info("Copying environment variables") - - a = data_.copy() - b = X.copy() - c = deepcopy(prep_pipe) - e = exp_name_log - z = logging_param - - def retain_original(a,b,c,e,z): - - global data_, X, prep_pipe, exp_name_log, logging_param - - data_ = a.copy() - X = b.copy() - prep_pipe = deepcopy(c) - exp_name_log = e - logging_param = z - - return data_, X, prep_pipe, exp_name_log, logging_param - - #setting up cufflinks - import cufflinks as cf - cf.go_offline() - cf.set_config_file(offline=False, world_readable=True) - - progress.value += 1 - - #define the problem - if data_[supervised_target].value_counts().count() == 2: - problem = 'classification' - logger.info("Objective : Classification") - else: - problem = 'regression' - logger.info("Objective : Regression") - - #define model name - - logger.info("Defining Model Name") - - if model == 'abod': - model_name = 'Angle-base Outlier Detection' - elif model == 'iforest': - model_name = 'Isolation Forest' - elif model == 'cluster': - model_name = 'Clustering-Based Local Outlier' - elif model == 'cof': - model_name = 'Connectivity-Based Outlier Factor' - elif model == 'histogram': - model_name = 'Histogram-based Outlier Detection' - elif model == 'knn': - model_name = 'k-Nearest Neighbors Detector' - elif model == 'lof': - model_name = 'Local Outlier Factor' - elif model == 'svm': - model_name = 'One-class SVM detector' - elif model == 'pca': - model_name = 'Principal Component Analysis' - elif model == 'mcd': - model_name = 'Minimum Covariance Determinant' - elif model == 'sod': - model_name = 'Subspace Outlier Detection' - elif model == 'sos': - model_name = 'Stochastic Outlier Selection' - - logger.info("Defining Supervised Estimator") - - #defining estimator: - if problem == 'classification' and estimator is None: - estimator = 'lr' - elif problem == 'regression' and estimator is None: - estimator = 'lr' - else: - 
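-        #user-specified estimator is kept as-is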
estimator = estimator - - logger.info("Defining Optimizer") - - #defining optimizer: - if optimize is None and problem == 'classification': - optimize = 'Accuracy' - elif optimize is None and problem == 'regression': - optimize = 'R2' - else: - optimize=optimize - - logger.info("Optimize: " + str(optimize)) - - progress.value += 1 - - #defining tuning grid - logger.info("Defining Tuning Grid") - - if custom_grid is not None: - - logger.info("Custom Grid used") - param_grid = custom_grid - param_grid_with_zero = [0] - - for i in param_grid: - param_grid_with_zero.append(i) - - else: - - logger.info("Pre-defined Grid used") - param_grid_with_zero = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10] - param_grid = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10] - - master = []; master_df = [] - - monitor.iloc[1,1:] = 'Creating Outlier Detection Model' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - """ - preprocess starts here - """ - - logger.info("Defining setup variables for preprocessing") - - #removing target variable from data by defining new setup - _data_ = data_.copy() - target_ = pd.DataFrame(_data_[supervised_target]) - from sklearn.preprocessing import LabelEncoder - le = LabelEncoder() - target_ = le.fit_transform(target_) - - cat_pass = prep_param.dtypes.categorical_features - num_pass = prep_param.dtypes.numerical_features - time_pass = prep_param.dtypes.time_features - ignore_pass = prep_param.dtypes.features_todrop - - #PCA - if 'Empty' in str(prep_param.pca): - pca_pass = False - pca_method_pass = 'linear' - - else: - pca_pass = True - - if prep_param.pca.method == 'pca_liner': - pca_method_pass = 'linear' - elif prep_param.pca.method == 'pca_kernal': - pca_method_pass = 'kernel' - elif prep_param.pca.method == 'incremental': - pca_method_pass = 'incremental' - - if pca_pass is True: - pca_comp_pass = prep_param.pca.variance_retained - else: - pca_comp_pass = 0.99 - - #IMPUTATION - if 'not_available' in prep_param.imputer.categorical_strategy: - cat_impute_pass = 'constant' - elif 'most frequent' in prep_param.imputer.categorical_strategy: - cat_impute_pass = 'mode' - - num_impute_pass = prep_param.imputer.numeric_strategy - - #NORMALIZE - if 'Empty' in str(prep_param.scaling): - normalize_pass = False - else: - normalize_pass = True - - if normalize_pass is True: - normalize_method_pass = prep_param.scaling.function_to_apply - else: - normalize_method_pass = 'zscore' - - #FEATURE TRANSFORMATION - if 'Empty' in str(prep_param.P_transform): - transformation_pass = False - else: - transformation_pass = True - - if transformation_pass is True: - - if 'yj' in prep_param.P_transform.function_to_apply: - transformation_method_pass = 'yeo-johnson' - elif 'quantile' in prep_param.P_transform.function_to_apply: - transformation_method_pass = 'quantile' - - else: - transformation_method_pass = 'yeo-johnson' - - #BIN NUMERIC FEATURES - if 'Empty' in str(prep_param.binn): - features_to_bin_pass = [] - apply_binning_pass = False - - else: - features_to_bin_pass = prep_param.binn.features_to_discretize - apply_binning_pass = True - - #COMBINE RARE LEVELS - if 'Empty' in str(prep_param.club_R_L): - combine_rare_levels_pass = False - combine_rare_threshold_pass = 0.1 - else: - combine_rare_levels_pass = True - combine_rare_threshold_pass = prep_param.club_R_L.threshold - - #ZERO NERO ZERO VARIANCE - if 'Empty' in str(prep_param.znz): - ignore_low_variance_pass = False - else: - ignore_low_variance_pass = True - - 
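-    #NOTE: each preprocessing attribute on prep_param holds a fitted
-    #transformer when the corresponding step was enabled in setup(), and the
-    #literal string 'Empty' otherwise; the str(...) membership checks in this
-    #section detect which case applies before reading transformer parameters.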
#MULTI-COLLINEARITY - if 'Empty' in str(prep_param.fix_multi): - remove_multicollinearity_pass = False - else: - remove_multicollinearity_pass = True - - if remove_multicollinearity_pass is True: - multicollinearity_threshold_pass = prep_param.fix_multi.threshold - else: - multicollinearity_threshold_pass = 0.9 - - #UNKNOWN CATEGORICAL LEVEL - if 'Empty' in str(prep_param.new_levels): - handle_unknown_categorical_pass = False - else: - handle_unknown_categorical_pass = True - - if handle_unknown_categorical_pass is True: - unknown_level_preprocess = prep_param.new_levels.replacement_strategy - if unknown_level_preprocess == 'least frequent': - unknown_categorical_method_pass = 'least_frequent' - elif unknown_level_preprocess == 'most frequent': - unknown_categorical_method_pass = 'most_frequent' - else: - unknown_categorical_method_pass = 'least_frequent' - else: - unknown_categorical_method_pass = 'least_frequent' - - #GROUP FEATURES - if 'Empty' in str(prep_param.group): - apply_grouping_pass = False - else: - apply_grouping_pass = True - - if apply_grouping_pass is True: - group_features_pass = prep_param.group.list_of_similar_features - else: - group_features_pass = None - - if apply_grouping_pass is True: - group_names_pass = prep_param.group.group_name - else: - group_names_pass = None - - #ORDINAL FEATURES - if 'Empty' in str(prep_param.ordinal): - ordinal_features_pass = None - else: - ordinal_features_pass = prep_param.ordinal.info_as_dict - - #HIGH CARDINALITY - if 'Empty' in str(prep_param.cardinality): - high_cardinality_features_pass = None - else: - high_cardinality_features_pass = prep_param.cardinality.feature - - global setup_without_target - - logger.info("SubProcess setup() called") - - setup_without_target = setup(data = data_, - categorical_features = cat_pass, - categorical_imputation = cat_impute_pass, - ordinal_features = ordinal_features_pass, #new - high_cardinality_features = high_cardinality_features_pass, #latest - numeric_features = num_pass, - numeric_imputation = num_impute_pass, - date_features = time_pass, - ignore_features = ignore_pass, - normalize = normalize_pass, - normalize_method = normalize_method_pass, - transformation = transformation_pass, - transformation_method = transformation_method_pass, - handle_unknown_categorical = handle_unknown_categorical_pass, - unknown_categorical_method = unknown_categorical_method_pass, - pca = pca_pass, - pca_components = pca_comp_pass, - pca_method = pca_method_pass, - ignore_low_variance = ignore_low_variance_pass, - combine_rare_levels = combine_rare_levels_pass, - rare_level_threshold = combine_rare_threshold_pass, - bin_numeric_features = features_to_bin_pass, - remove_multicollinearity = remove_multicollinearity_pass, - multicollinearity_threshold = multicollinearity_threshold_pass, - group_features = group_features_pass, - group_names = group_names_pass, - supervised = True, - supervised_target = supervised_target, - session_id = seed, - log_experiment = False, #added in pycaret==2.0.0 - profile=False, - verbose=False) - - data_without_target = setup_without_target[0] - - logger.info("SubProcess setup() end") - - """ - preprocess ends here - """ - - #adding dummy model in master - master.append('No Model Required') - master_df.append('No Model Required') - - model_fit_time_list = [] - - for i in param_grid: - logger.info("Fitting Model with Fraction = " +str(i)) - progress.value += 1 - monitor.iloc[2,1:] = 'Fitting Model With ' + str(i) + ' Fraction' - if verbose: - if html_param: - update_display(monitor, 
display_id = 'monitor') - - #create and assign the model to dataset d - model_fit_start = time.time() - logger.info("SubProcess create_model() called==================================") - m = create_model(model=model, fraction=i, verbose=False, system=False) - logger.info("SubProcess create_model() end==================================") - model_fit_end = time.time() - model_fit_time = np.array(model_fit_end - model_fit_start).round(2) - model_fit_time_list.append(model_fit_time) - - logger.info("Generating labels") - logger.info("SubProcess assign_model() called==================================") - d = assign_model(m, transformation=True, score=True, verbose=False) - logger.info("SubProcess assign_model() ends==================================") - d[str(supervised_target)] = target_ - - master.append(m) - master_df.append(d) - - - #attaching target variable back - data_[str(supervised_target)] = target_ - - logger.info("Defining Supervised Estimator") - - if problem == 'classification': - - logger.info("Problem : Classification") - - """ - - defining estimator - - """ - - monitor.iloc[1,1:] = 'Evaluating Anomaly Model' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - if estimator == 'lr': - - from sklearn.linear_model import LogisticRegression - model = LogisticRegression(random_state=seed) - full_name = 'Logistic Regression' - - elif estimator == 'knn': - - from sklearn.neighbors import KNeighborsClassifier - model = KNeighborsClassifier() - full_name = 'K Nearest Neighbours' - - elif estimator == 'nb': - - from sklearn.naive_bayes import GaussianNB - model = GaussianNB() - full_name = 'Naive Bayes' - - elif estimator == 'dt': - - from sklearn.tree import DecisionTreeClassifier - model = DecisionTreeClassifier(random_state=seed) - full_name = 'Decision Tree' - - elif estimator == 'svm': - - from sklearn.linear_model import SGDClassifier - model = SGDClassifier(max_iter=1000, tol=0.001, random_state=seed) - full_name = 'Support Vector Machine' - - elif estimator == 'rbfsvm': - - from sklearn.svm import SVC - model = SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed) - full_name = 'RBF SVM' - - elif estimator == 'gpc': - - from sklearn.gaussian_process import GaussianProcessClassifier - model = GaussianProcessClassifier(random_state=seed) - full_name = 'Gaussian Process Classifier' - - elif estimator == 'mlp': - - from sklearn.neural_network import MLPClassifier - model = MLPClassifier(max_iter=500, random_state=seed) - full_name = 'Multi Level Perceptron' - - elif estimator == 'ridge': - - from sklearn.linear_model import RidgeClassifier - model = RidgeClassifier(random_state=seed) - full_name = 'Ridge Classifier' - - elif estimator == 'rf': - - from sklearn.ensemble import RandomForestClassifier - model = RandomForestClassifier(n_estimators=10, random_state=seed) - full_name = 'Random Forest Classifier' - - elif estimator == 'qda': - - from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis - model = QuadraticDiscriminantAnalysis() - full_name = 'Quadratic Discriminant Analysis' - - elif estimator == 'ada': - - from sklearn.ensemble import AdaBoostClassifier - model = AdaBoostClassifier(random_state=seed) - full_name = 'AdaBoost Classifier' - - elif estimator == 'gbc': - - from sklearn.ensemble import GradientBoostingClassifier - model = GradientBoostingClassifier(random_state=seed) - full_name = 'Gradient Boosting Classifier' - - elif estimator == 'lda': - - from sklearn.discriminant_analysis import 
LinearDiscriminantAnalysis - model = LinearDiscriminantAnalysis() - full_name = 'Linear Discriminant Analysis' - - elif estimator == 'et': - - from sklearn.ensemble import ExtraTreesClassifier - model = ExtraTreesClassifier(random_state=seed) - full_name = 'Extra Trees Classifier' - - elif estimator == 'xgboost': - - from xgboost import XGBClassifier - model = XGBClassifier(random_state=seed, n_jobs=-1, verbosity=0) - full_name = 'Extreme Gradient Boosting' - - elif estimator == 'lightgbm': - - import lightgbm as lgb - model = lgb.LGBMClassifier(random_state=seed) - full_name = 'Light Gradient Boosting Machine' - - elif estimator == 'catboost': - from catboost import CatBoostClassifier - model = CatBoostClassifier(random_state=seed, silent=True) # Silent is True to suppress CatBoost iteration results - full_name = 'CatBoost Classifier' - - logger.info(str(full_name) + " Imported Successfully") - - progress.value += 1 - - """ - start model building here - - """ - - logger.info("Creating Classifier without Anomaly") - acc = []; auc = []; recall = []; prec = []; kappa = []; f1 = [] - - #build model without anomaly - monitor.iloc[2,1:] = 'Evaluating Classifier Without Anomaly Detector' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - d = master_df[1].copy() - d.drop(['Label', 'Score'], axis=1, inplace=True) - - #drop NA's caution - d.dropna(axis=0, inplace=True) - - #get_dummies to caste categorical variables for supervised learning - d = pd.get_dummies(d) - - #split the dataset - X = d.drop(supervised_target, axis=1) - y = d[supervised_target] - - #fit the model - logger.info("Fitting Model") - model.fit(X,y) - - #generate the prediction and evaluate metric - logger.info("Evaluating Cross Val Predictions") - pred = cross_val_predict(model,X,y,cv=fold, method = 'predict') - - acc_ = metrics.accuracy_score(y,pred) - acc.append(acc_) - - recall_ = metrics.recall_score(y,pred) - recall.append(recall_) - - precision_ = metrics.precision_score(y,pred) - prec.append(precision_) - - kappa_ = metrics.cohen_kappa_score(y,pred) - kappa.append(kappa_) - - f1_ = metrics.f1_score(y,pred) - f1.append(f1_) - - if hasattr(model,'predict_proba'): - pred_ = cross_val_predict(model,X,y,cv=fold, method = 'predict_proba') - pred_prob = pred_[:,1] - auc_ = metrics.roc_auc_score(y,pred_prob) - auc.append(auc_) - - else: - auc.append(0) - - for i in range(1,len(master_df)): - progress.value += 1 - param_grid_val = param_grid[i-1] - - logger.info("Creating Classifier with Fraction = " + str(param_grid_val)) - - monitor.iloc[2,1:] = 'Evaluating Classifier With ' + str(param_grid_val) + ' Fraction' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - #prepare the dataset for supervised problem - d = master_df[i] - - #cleaning the dataframe for supervised learning - d.dropna(axis=0, inplace=True) - Score_ = pd.DataFrame(d['Score']) - Score = scaler.fit_transform(Score_) - d['Score'] = Score - - if method == 'drop': - d = d[d['Label'] == 0] - d.drop(['Label'], axis=1, inplace=True) - - #get_dummies to caste categorical variables for supervised learning - d = pd.get_dummies(d) - - #split the dataset - X = d.drop(supervised_target, axis=1) - y = d[supervised_target] - - #fit the model - logger.info("Fitting Model") - model.fit(X,y) - - #generate the prediction and evaluate metric - logger.info("Generating Cross Val Predictions") - pred = cross_val_predict(model,X,y,cv=fold, method = 'predict') - - acc_ = metrics.accuracy_score(y,pred) - acc.append(acc_) - - 
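-            #sklearn defaults apply to the metrics below (average='binary',
-            #pos_label=1), consistent with the two-class check used earlier
-            #to set problem = 'classification'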
recall_ = metrics.recall_score(y,pred) - recall.append(recall_) - - precision_ = metrics.precision_score(y,pred) - prec.append(precision_) - - kappa_ = metrics.cohen_kappa_score(y,pred) - kappa.append(kappa_) - - f1_ = metrics.f1_score(y,pred) - f1.append(f1_) - - if hasattr(model,'predict_proba'): - pred_ = cross_val_predict(model,X,y,cv=fold, method = 'predict_proba') - pred_prob = pred_[:,1] - auc_ = metrics.roc_auc_score(y,pred_prob) - auc.append(auc_) - - else: - auc.append(0) - - - monitor.iloc[1,1:] = 'Compiling Results' - monitor.iloc[1,1:] = 'Finalizing' - - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - logger.info("Creating metrics dataframe") - df = pd.DataFrame({'Fraction %': param_grid_with_zero, 'Accuracy' : acc, 'AUC' : auc, 'Recall' : recall, - 'Precision' : prec, 'F1' : f1, 'Kappa' : kappa}) - - sorted_df = df.sort_values(by=optimize, ascending=False) - ival = sorted_df.index[0] - - best_model = master[ival] - best_model_df = master_df[ival] - best_model_tt = model_fit_time_list[ival] - - progress.value += 1 - logger.info("Rendering Visual") - sd = pd.melt(df, id_vars=['Fraction %'], value_vars=['Accuracy', 'AUC', 'Recall', 'Precision', 'F1', 'Kappa'], - var_name='Metric', value_name='Score') - - fig = px.line(sd, x='Fraction %', y='Score', color='Metric', line_shape='linear', range_y = [0,1]) - fig.update_layout(plot_bgcolor='rgb(245,245,245)') - title= str(full_name) + ' Metrics and Fraction %' - fig.update_layout(title={'text': title, 'y':0.95,'x':0.45,'xanchor': 'center','yanchor': 'top'}) - - fig.show() - logger.info("Visual Rendered Successfully") - - #monitor = '' - - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - if verbose: - if html_param: - monitor_out.clear_output() - progress.close() - - best_k = np.array(sorted_df.head(1)['Fraction %'])[0] - best_m = round(np.array(sorted_df.head(1)[optimize])[0],4) - p = 'Best Model: ' + model_name + ' |' + ' Fraction %: ' + str(best_k) + ' | ' + str(optimize) + ' : ' + str(best_m) - print(p) - - elif problem == 'regression': - - logger.info("Problem : Regression") - - """ - - defining estimator - - """ - - monitor.iloc[1,1:] = 'Evaluating Anomaly Model' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - if estimator == 'lr': - - from sklearn.linear_model import LinearRegression - model = LinearRegression() - full_name = 'Linear Regression' - - elif estimator == 'lasso': - - from sklearn.linear_model import Lasso - model = Lasso(random_state=seed) - full_name = 'Lasso Regression' - - elif estimator == 'ridge': - - from sklearn.linear_model import Ridge - model = Ridge(random_state=seed) - full_name = 'Ridge Regression' - - elif estimator == 'en': - - from sklearn.linear_model import ElasticNet - model = ElasticNet(random_state=seed) - full_name = 'Elastic Net' - - elif estimator == 'lar': - - from sklearn.linear_model import Lars - model = Lars() - full_name = 'Least Angle Regression' - - elif estimator == 'llar': - - from sklearn.linear_model import LassoLars - model = LassoLars() - full_name = 'Lasso Least Angle Regression' - - elif estimator == 'omp': - - from sklearn.linear_model import OrthogonalMatchingPursuit - model = OrthogonalMatchingPursuit() - full_name = 'Orthogonal Matching Pursuit' - - elif estimator == 'br': - from sklearn.linear_model import BayesianRidge - model = BayesianRidge() - full_name = 'Bayesian Ridge Regression' - - elif estimator == 'ard': - - from sklearn.linear_model import ARDRegression - 
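-            #ARD = Automatic Relevance Determination (sparse Bayesian regression)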
model = ARDRegression() - full_name = 'Automatic Relevance Determination' - - elif estimator == 'par': - - from sklearn.linear_model import PassiveAggressiveRegressor - model = PassiveAggressiveRegressor(random_state=seed) - full_name = 'Passive Aggressive Regressor' - - elif estimator == 'ransac': - - from sklearn.linear_model import RANSACRegressor - model = RANSACRegressor(random_state=seed) - full_name = 'Random Sample Consensus' - - elif estimator == 'tr': - - from sklearn.linear_model import TheilSenRegressor - model = TheilSenRegressor(random_state=seed) - full_name = 'TheilSen Regressor' - - elif estimator == 'huber': - - from sklearn.linear_model import HuberRegressor - model = HuberRegressor() - full_name = 'Huber Regressor' - - elif estimator == 'kr': - - from sklearn.kernel_ridge import KernelRidge - model = KernelRidge() - full_name = 'Kernel Ridge' - - elif estimator == 'svm': - - from sklearn.svm import SVR - model = SVR() - full_name = 'Support Vector Regression' - - elif estimator == 'knn': - - from sklearn.neighbors import KNeighborsRegressor - model = KNeighborsRegressor() - full_name = 'Nearest Neighbors Regression' - - elif estimator == 'dt': - - from sklearn.tree import DecisionTreeRegressor - model = DecisionTreeRegressor(random_state=seed) - full_name = 'Decision Tree Regressor' - - elif estimator == 'rf': - - from sklearn.ensemble import RandomForestRegressor - model = RandomForestRegressor(random_state=seed) - full_name = 'Random Forest Regressor' - - elif estimator == 'et': - - from sklearn.ensemble import ExtraTreesRegressor - model = ExtraTreesRegressor(random_state=seed) - full_name = 'Extra Trees Regressor' - - elif estimator == 'ada': - - from sklearn.ensemble import AdaBoostRegressor - model = AdaBoostRegressor(random_state=seed) - full_name = 'AdaBoost Regressor' - - elif estimator == 'gbr': - - from sklearn.ensemble import GradientBoostingRegressor - model = GradientBoostingRegressor(random_state=seed) - full_name = 'Gradient Boosting Regressor' - - elif estimator == 'mlp': - - from sklearn.neural_network import MLPRegressor - model = MLPRegressor(random_state=seed) - full_name = 'MLP Regressor' - - elif estimator == 'xgboost': - - from xgboost import XGBRegressor - model = XGBRegressor(random_state=seed, n_jobs=-1, verbosity=0) - full_name = 'Extreme Gradient Boosting Regressor' - - elif estimator == 'lightgbm': - - import lightgbm as lgb - model = lgb.LGBMRegressor(random_state=seed) - full_name = 'Light Gradient Boosting Machine' - - elif estimator == 'catboost': - - from catboost import CatBoostRegressor - model = CatBoostRegressor(random_state=seed, silent = True) - full_name = 'CatBoost Regressor' - - logger.info(str(full_name) + " Imported Successfully") - - progress.value += 1 - - """ - start model building here - - """ - - logger.info("Creating Regressor without anomaly") - - score = [] - metric = [] - - #build model without anomaly - monitor.iloc[2,1:] = 'Evaluating Regressor Without Anomaly Detector' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - d = master_df[1].copy() - d.drop(['Label', 'Score'], axis=1, inplace=True) - - #drop NA's caution - d.dropna(axis=0, inplace=True) - - #get_dummies to caste categorical variables for supervised learning - d = pd.get_dummies(d) - - #split the dataset - X = d.drop(supervised_target, axis=1) - y = d[supervised_target] - - #fit the model - logger.info("Fitting Model") - model.fit(X,y) - - #generate the prediction and evaluate metric - logger.info("Generating Cross Val 
Predictions") - pred = cross_val_predict(model,X,y,cv=fold, method = 'predict') - - if optimize == 'R2': - r2_ = metrics.r2_score(y,pred) - score.append(r2_) - - elif optimize == 'MAE': - mae_ = metrics.mean_absolute_error(y,pred) - score.append(mae_) - - elif optimize == 'MSE': - mse_ = metrics.mean_squared_error(y,pred) - score.append(mse_) - - elif optimize == 'RMSE': - mse_ = metrics.mean_squared_error(y,pred) - rmse_ = np.sqrt(mse_) - score.append(rmse_) - - elif optimize == 'RMSLE': - rmsle = np.sqrt(np.mean(np.power(np.log(np.array(abs(pred))+1) - np.log(np.array(abs(y))+1), 2))) - score.append(rmsle) - - elif optimize == 'MAPE': - - def calculate_mape(actual, prediction): - mask = actual != 0 - return (np.fabs(actual - prediction)/actual)[mask].mean() - - mape = calculate_mape(y,pred) - score.append(mape) - - metric.append(str(optimize)) - - for i in range(1,len(master_df)): - progress.value += 1 - param_grid_val = param_grid[i-1] - - logger.info("Creating Regressor with Fraction = " + str(param_grid_val)) - - monitor.iloc[2,1:] = 'Evaluating Regressor With ' + str(param_grid_val) + ' Fraction' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - #prepare the dataset for supervised problem - d = master_df[i] - - #cleaning the dataframe for supervised learning - d.dropna(axis=0, inplace=True) - Score_ = pd.DataFrame(d['Score']) - Score = scaler.fit_transform(Score_) - d['Score'] = Score - - if method == 'drop': - d = d[d['Label'] == 0] - d.drop(['Label'], axis=1, inplace=True) - - #get_dummies to caste categorical variable for supervised learning - d = pd.get_dummies(d) - - #split the dataset - X = d.drop(supervised_target, axis=1) - y = d[supervised_target] - - #fit the model - logger.info("Fitting Model") - model.fit(X,y) - - #generate the prediction and evaluate metric - logger.info("Generating Cross Val Predictions") - pred = cross_val_predict(model,X,y,cv=fold, method = 'predict') - - if optimize == 'R2': - r2_ = metrics.r2_score(y,pred) - score.append(r2_) - - elif optimize == 'MAE': - mae_ = metrics.mean_absolute_error(y,pred) - score.append(mae_) - - elif optimize == 'MSE': - mse_ = metrics.mean_squared_error(y,pred) - score.append(mse_) - - elif optimize == 'RMSE': - mse_ = metrics.mean_squared_error(y,pred) - rmse_ = np.sqrt(mse_) - score.append(rmse_) - - elif optimize == 'RMSLE': - rmsle = np.sqrt(np.mean(np.power(np.log(np.array(abs(pred))+1) - np.log(np.array(abs(y))+1), 2))) - score.append(rmsle) - - elif optimize == 'MAPE': - - def calculate_mape(actual, prediction): - mask = actual != 0 - return (np.fabs(actual - prediction)/actual)[mask].mean() - - mape = calculate_mape(y,pred) - score.append(mape) - - metric.append(str(optimize)) - - monitor.iloc[1,1:] = 'Compiling Results' - monitor.iloc[1,1:] = 'Finalizing' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - logger.info("Creating metrics dataframe") - df = pd.DataFrame({'Fraction': param_grid_with_zero, 'Score' : score, 'Metric': metric}) - df.columns = ['Fraction %', optimize, 'Metric'] - - #sorting to return best model - if optimize == 'R2': - sorted_df = df.sort_values(by=optimize, ascending=False) - else: - sorted_df = df.sort_values(by=optimize, ascending=True) - - ival = sorted_df.index[0] - - best_model = master[ival] - best_model_df = master_df[ival] - best_model_tt = model_fit_time_list[ival] - - logger.info("Rendering Visual") - - fig = px.line(df, x='Fraction %', y=optimize, line_shape='linear', - title= str(full_name) + ' Metrics and 
Fraction %', color='Metric') - - fig.update_layout(plot_bgcolor='rgb(245,245,245)') - progress.value += 1 - - fig.show() - - logger.info("Visual Rendered Successfully") - - #monitor = '' - - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - monitor_out.clear_output() - progress.close() - - best_k = np.array(sorted_df.head(1)['Fraction %'])[0] - best_m = round(np.array(sorted_df.head(1)[optimize])[0],4) - p = 'Best Model: ' + model_name + ' |' + ' Fraction %: ' + str(best_k) + ' | ' + str(optimize) + ' : ' + str(best_m) - print(p) - - logger.info("Resetting environment to original variables") - org = retain_original(a,b,c,e,z) - - #end runtime - runtime_end = time.time() - runtime = np.array(runtime_end - runtime_start).round(2) - - #mlflow logging - if logging_param: - - logger.info("Creating MLFlow logs") - - #import mlflow - import mlflow - from pathlib import Path - import os - - mlflow.set_experiment(exp_name_log) - - #Creating Logs message monitor - monitor.iloc[1,1:] = 'Creating Logs' - if verbose: - if html_param: - update_display(monitor, display_id = 'monitor') - - mlflow.set_experiment(exp_name_log) - - with mlflow.start_run(run_name=model_name) as run: - - # Get active run to log as tag - RunID = mlflow.active_run().info.run_id - - # Log model parameters - params = best_model.get_params() - - for i in list(params): - v = params.get(i) - if len(str(v)) > 250: - params.pop(i) - - mlflow.log_params(params) - - #set tag of compare_models - mlflow.set_tag("Source", "tune_model") - - import secrets - URI = secrets.token_hex(nbytes=4) - mlflow.set_tag("URI", URI) - mlflow.set_tag("USI", USI) - mlflow.set_tag("Run Time", runtime) - mlflow.set_tag("Run ID", RunID) - - # Log training time in seconds - mlflow.log_metric("TT", best_model_tt) #change this - - # Log plot to html - fig.write_html("Iterations.html") - mlflow.log_artifact('Iterations.html') - os.remove('Iterations.html') - - # Log model and transformation pipeline - logger.info("SubProcess save_model() called ==================================") - save_model(best_model, 'Trained Model', verbose=False) - logger.info("SubProcess save_model() end ==================================") - mlflow.log_artifact('Trained Model' + '.pkl') - size_bytes = Path('Trained Model.pkl').stat().st_size - size_kb = np.round(size_bytes/1000, 2) - mlflow.set_tag("Size KB", size_kb) - os.remove('Trained Model.pkl') - - logger.info(str(best_model)) - logger.info("tune_model() succesfully completed......................................") - - return best_model - -def plot_model(model, - plot = 'tsne', - feature = None, - save = False, #added in pycaret 2.0.0 - system = True): #added in pycaret 2.0.0 - - - """ - - Description: - ------------ - This function takes a trained model object and returns a plot on the dataset - passed during setup stage. This function internally calls assign_model before - generating a plot. - - Example: - -------- - from pycaret.datasets import get_data - anomaly = get_data('anomaly') - experiment_name = setup(data = anomaly, normalize = True) - knn = create_model('knn') - - plot_model(knn) - - Parameters - ---------- - - model : object - A trained model object can be passed. Model must be created using create_model(). - - plot : string, default = 'tsne' - Enter abbreviation of type of plot. 
The current list of plots supported are: - - Plot Name - ------- ---------- - 'tsne' t-SNE (3d) Dimension Plot - 'umap' UMAP Dimensionality Plot - - feature : string, default = None - feature column is used as a hoverover tooltip. By default, first of column of the - dataset is chosen as hoverover tooltip, when no feature is passed. - - save: Boolean, default = False - Plot is saved as png file in local directory when save parameter set to True. - - system: Boolean, default = True - Must remain True all times. Only to be changed by internal functions. - - Returns: - -------- - - Visual Plot: Prints the visual plot. - ------------ - - """ - - #exception checking - import sys - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing plot_model()") - logger.info("""plot_model(model={}, plot={}, feature={}, save={}, system={})""".\ - format(str(model), str(plot), str(feature), str(save), str(system))) - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - """ - exception handling starts here - """ - - logger.info("Checking exceptions") - - #plot checking - allowed_plots = ['tsne', 'umap'] - if plot not in allowed_plots: - sys.exit('(Value Error): Plot Not Available. Please see docstring for list of available plots.') - - - - """ - error handling ends here - """ - - logger.info("Importing libraries") - #import dependencies - import pandas as pd - import numpy - - #import cufflinks - import cufflinks as cf - cf.go_offline() - cf.set_config_file(offline=False, world_readable=True) - - logger.info("plot type: " + str(plot)) - - if plot == 'tsne': - - logger.info("SubProcess assign_model() called ==================================") - b = assign_model(model, verbose=False, transformation=True, score=False) - logger.info("SubProcess assign_model() end ==================================") - Label = pd.DataFrame(b['Label']) - b.dropna(axis=0, inplace=True) #droping rows with NA's - b.drop(['Label'], axis=1, inplace=True) - - logger.info("Getting dummies to cast categorical variables") - b = pd.get_dummies(b) #casting categorical variables - - from sklearn.manifold import TSNE - logger.info("Fitting TSNE()") - X_embedded = TSNE(n_components=3).fit_transform(b) - - X = pd.DataFrame(X_embedded) - X['Label'] = Label - - if feature is not None: - X['Feature'] = data_[feature] - else: - X['Feature'] = data_[data_.columns[0]] - - import plotly.express as px - df = X - - logger.info("Rendering Visual") - - fig = px.scatter_3d(df, x=0, y=1, z=2, hover_data=['Feature'], color='Label', title='3d TSNE Plot for Outliers', - opacity=0.7, width=900, height=800) - - - if system: - fig.show() - - logger.info("Visual Rendered Successfully") - - if save: - fig.write_html("TSNE.html") - logger.info("Saving 'TSNE.html' in current active directory") - - elif plot == 'umap': - - logger.info("SubProcess assign_model() called ==================================") - b = assign_model(model, verbose=False, transformation=True, score=False) - logger.info("SubProcess assign_model() end ==================================") - - Label = 
pd.DataFrame(b['Label']) - b.dropna(axis=0, inplace=True) #droping rows with NA's - b.drop(['Label'], axis=1, inplace=True) - - logger.info("Getting dummies to cast categorical variables") - b = pd.get_dummies(b) #casting categorical variables - - import umap - reducer = umap.UMAP() - logger.info("Fitting UMAP()") - embedding = reducer.fit_transform(b) - X = pd.DataFrame(embedding) - - import plotly.express as px - df = X - df['Label'] = Label - - if feature is not None: - df['Feature'] = data_[feature] - else: - df['Feature'] = data_[data_.columns[0]] - - logger.info("Rendering Visual") - - fig = px.scatter(df, x=0, y=1, - color='Label', title='uMAP Plot for Outliers', hover_data=['Feature'], opacity=0.7, - width=900, height=800) - if system: - fig.show() - - logger.info("Visual Rendered Successfully") - - if save: - fig.write_html("UMAP.html") - logger.info("Saving 'UMAP.html' in current active directory") - - logger.info("plot_model() succesfully completed......................................") - -def save_model(model, model_name, verbose=True): - - """ - - Description: - ------------ - This function saves the transformation pipeline and trained model object - into the current active directory as a pickle file for later use. - - Example: - -------- - from pycaret.datasets import get_data - anomaly = get_data('anomaly') - experiment_name = setup(data = anomaly, normalize = True) - knn = create_model('knn') - - save_model(knn, 'knn_model_23122019') - - This will save the transformation pipeline and model as a binary pickle - file in the current directory. - - Parameters - ---------- - model : object, default = none - A trained model object should be passed. - - model_name : string, default = none - Name of pickle file to be passed as a string. - - verbose : bool, default = True - When set to False, success message is not printed. - - Returns: - -------- - Success Message - - - """ - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing save_model()") - logger.info("""save_model(model={}, model_name={}, verbose={})""".\ - format(str(model), str(model_name), str(verbose))) - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - logger.info("Appending prep pipeline") - model_ = [] - model_.append(prep_pipe) - model_.append(model) - - import joblib - model_name = model_name + '.pkl' - joblib.dump(model_, model_name) - if verbose: - print('Transformation Pipeline and Model Succesfully Saved') - - logger.info(str(model_name) + ' saved in current working directory') - logger.info(str(model_)) - logger.info("save_model() succesfully completed......................................") - -def load_model(model_name, - platform = None, - authentication = None, - verbose=True): - - """ - - Description: - ------------ - This function loads a previously saved transformation pipeline and model - from the current active directory into the current python environment. - Load object must be a pickle file. 
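-    The '.pkl' extension is appended internally, so model_name must be passed
-    without the extension.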
-
-    Example:
-    --------
-    saved_knn = load_model('knn_model_23122019')
-
-    This will load the previously saved model into the saved_knn variable. The
-    file must be in the current directory.
-
-    Parameters
-    ----------
-    model_name : string, default = none
-    Name of pickle file to be passed as a string.
-
-    platform: string, default = None
-    Name of platform, if loading model from cloud. Current available options are:
-    'aws'.
-
-    authentication : dict
-    dictionary of applicable authentication tokens.
-
-    When platform = 'aws':
-    {'bucket' : 'Name of Bucket on S3'}
-
-    verbose: Boolean, default = True
-    Success message is not printed when verbose is set to False.
-
-    Returns:
-    --------
-    Success Message
-
-    """
-
-    #ignore warnings
-    import warnings
-    warnings.filterwarnings('ignore')
-
-    #exception checking
-    import sys
-
-    if platform is not None:
-        if authentication is None:
-            sys.exit("(Value Error): Authentication is missing.")
-
-    #cloud provider
-    if platform == 'aws':
-
-        import boto3
-        bucketname = authentication.get('bucket')
-        filename = str(model_name) + '.pkl'
-        s3 = boto3.resource('s3')
-        s3.Bucket(bucketname).download_file(filename, filename)
-        filename = str(model_name)
-        model = load_model(filename, verbose=False)
-
-        if verbose:
-            print('Transformation Pipeline and Model Successfully Loaded')
-
-        return model
-
-    import joblib
-    model_name = model_name + '.pkl'
-    if verbose:
-        print('Transformation Pipeline and Model Successfully Loaded')
-
-    return joblib.load(model_name)
-
-def predict_model(model,
-                  data,
-                  platform=None,
-                  authentication=None):
-
-    """
-
-    Description:
-    ------------
-    This function is used to predict new data using a trained model. It requires a
-    trained model object created using one of the functions in pycaret that return
-    a trained model object. New data must be passed to the data param as a pandas
-    Dataframe.
-
-    Example:
-    --------
-    from pycaret.datasets import get_data
-    anomaly = get_data('anomaly')
-    experiment_name = setup(data = anomaly)
-    knn = create_model('knn')
-
-    knn_predictions = predict_model(model = knn, data = anomaly)
-
-    Parameters
-    ----------
-    model : object / string, default = None
-    When model is passed as string, load_model() is called internally to load the
-    pickle file from the active directory, or from a cloud platform when the
-    platform param is passed.
-
-    data : {array-like, sparse matrix}, shape (n_samples, n_features) where n_samples
-    is the number of samples and n_features is the number of features. All features
-    used during training must be present in the new dataset.
-
-    platform: string, default = None
-    Name of platform, if loading model from cloud. Current available options are:
-    'aws'.
-
-    authentication : dict
-    dictionary of applicable authentication tokens.
-
-    When platform = 'aws':
-    {'bucket' : 'Name of Bucket on S3'}
-
-    Returns:
-    --------
-
-    dataframe: Original data with 'Label' and 'Score' columns appended.
- ---------- - - - """ - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - #testing - #no active tests - - #general dependencies - from IPython.display import clear_output, update_display - import numpy as np - import pandas as pd - import re - from sklearn import metrics - from copy import deepcopy - import sys - - #copy data and model - data__ = data.copy() - model_ = deepcopy(model) - clear_output() - - if type(model) is str: - if platform == 'aws': - model_ = load_model(str(model), platform='aws', - authentication={'bucket': authentication.get('bucket')}, - verbose=False) - - else: - model_ = load_model(str(model), verbose=False) - - - #separate prep_data pipeline - if type(model_) is list: - prep_pipe_transformer = model_[0] - model = model_[1] - - else: - try: - prep_pipe_transformer = prep_pipe - except: - sys.exit('Transformation Pipeline Missing') - - #exception checking for predict param - if hasattr(model, 'predict'): - pass - else: - sys.exit("(Type Error): Model doesn't support predict parameter.") - - - #predictions start here - _data_ = prep_pipe_transformer.transform(data__) - pred = model.predict(_data_) - pred_score = model.decision_function(_data_) - - data__['Label'] = pred - data__['Score'] = pred_score - - return data__ - -def deploy_model(model, - model_name, - authentication, - platform = 'aws'): - - """ - - Description: - ------------ - (In Preview) - - This function deploys the transformation pipeline and trained model object for - production use. The platform of deployment can be defined under the platform - param along with the applicable authentication tokens which are passed as a - dictionary to the authentication param. - - Example: - -------- - from pycaret.datasets import get_data - anomaly = get_data('anomaly') - experiment_name = setup(data = anomaly, normalize=True) - knn = create_model('knn') - - deploy_model(model = knn, model_name = 'deploy_knn', platform = 'aws', - authentication = {'bucket' : 'pycaret-test'}) - - This will deploy the model on an AWS S3 account under bucket 'pycaret-test' - - For AWS users: - -------------- - Before deploying a model to an AWS S3 ('aws'), environment variables must be - configured using the command line interface. To configure AWS env. variables, - type aws configure in your python command line. The following information is - required which can be generated using the Identity and Access Management (IAM) - portal of your amazon console account: - - - AWS Access Key ID - - AWS Secret Key Access - - Default Region Name (can be seen under Global settings on your AWS console) - - Default output format (must be left blank) - - Parameters - ---------- - model : object - A trained model object should be passed as an estimator. - - model_name : string - Name of model to be passed as a string. - - authentication : dict - dictionary of applicable authentication tokens. - - When platform = 'aws': - {'bucket' : 'Name of Bucket on S3'} - - platform: string, default = 'aws' - Name of platform for deployment. Current available options are: 'aws'. 
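-
-    Note: a model deployed this way can be loaded back by passing the same
-    model_name to load_model() along with platform = 'aws' and the same
-    authentication dict.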
- - Returns: - -------- - Success Message - - - """ - - import sys - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing deploy_model()") - logger.info("""deploy_model(model={}, model_name={}, authentication={}, platform={})""".\ - format(str(model), str(model_name), str(authentication), str(platform))) - - #checking if awscli available - try: - import awscli - except: - logger.error("awscli library not found. pip install awscli to use deploy_model function.") - sys.exit("awscli library not found. pip install awscli to use deploy_model function.") - - #ignore warnings - import warnings - warnings.filterwarnings('ignore') - - #general dependencies - import ipywidgets as ipw - import pandas as pd - from IPython.display import clear_output, update_display - import os - - if platform == 'aws': - - logger.info("Platform : AWS S3") - - import boto3 - logger.info("Saving model in current working directory") - logger.info("SubProcess save_model() called ==================================") - save_model(model, model_name = model_name, verbose=False) - logger.info("SubProcess save_model() end ==================================") - - #initiaze s3 - logger.info("Initializing S3 client") - s3 = boto3.client('s3') - filename = str(model_name)+'.pkl' - key = str(model_name)+'.pkl' - bucket_name = authentication.get('bucket') - s3.upload_file(filename,bucket_name,key) - clear_output() - os.remove(filename) - print("Model Succesfully Deployed on AWS S3") - logger.info(str(model)) - logger.info("deploy_model() succesfully completed......................................") - -def get_outliers(data, - model = None, - fraction=0.05, - ignore_features = None, - normalize = True, - transformation = False, - pca = False, - pca_components = 0.99, - ignore_low_variance=False, - combine_rare_levels=False, - rare_level_threshold=0.1, - remove_multicollinearity=False, - multicollinearity_threshold=0.9, - n_jobs = None): - - """ - Magic function to get outliers in Power Query / Power BI. 
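-
-    A minimal illustrative call ('my_data' is a placeholder dataframe):
-
-    dataset = get_outliers(data = my_data, model = 'knn', fraction = 0.05)
-
-    The original dataframe is returned with 'Label' (1 = outlier, 0 = inlier)
-    and 'Score' columns appended.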
- - """ - - if model is None: - model = 'knn' - - if ignore_features is None: - ignore_features_pass = [] - else: - ignore_features_pass = ignore_features - - global X, data_, seed, n_jobs_param, logging_param, logger - - n_jobs_param = n_jobs - - logging_param = False - - import logging - - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - data_ = data.copy() - - seed = 99 - - from pycaret import preprocess - - X = preprocess.Preprocess_Path_Two(train_data = data, - features_todrop = ignore_features_pass, - display_types = False, - scale_data = normalize, - scaling_method = 'zscore', - Power_transform_data = transformation, - Power_transform_method = 'yj', - apply_pca = pca, - pca_variance_retained_or_number_of_components=pca_components, - apply_zero_nearZero_variance = ignore_low_variance, - club_rare_levels=combine_rare_levels, - rara_level_threshold_percentage=rare_level_threshold, - remove_multicollinearity=remove_multicollinearity, - maximum_correlation_between_features=multicollinearity_threshold, - random_state = seed) - - - - c = create_model(model=model, fraction=fraction, verbose=False, system=False) - - dataset = assign_model(c, verbose=False) - - return dataset - -def models(): - - """ - - Description: - ------------ - Returns table of models available in model library. - - Example - ------- - all_models = models() - - This will return pandas dataframe with all available - models and their metadata. - - """ - - import pandas as pd - - model_id = ['abod', 'iforest', 'cluster', 'cof', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos'] - - model_name = ['Angle-base Outlier Detection', - 'Isolation Forest', - 'Clustering-Based Local Outlier', - 'Connectivity-Based Outlier Factor', - 'Histogram-based Outlier Detection', - 'k-Nearest Neighbors Detector', - 'Local Outlier Factor', - 'One-class SVM detector', - 'Principal Component Analysis', - 'Minimum Covariance Determinant', - 'Subspace Outlier Detection', - 'Stochastic Outlier Selection'] - - model_ref = ['pyod.models.abod.ABOD', - 'pyod.models.iforest', - 'pyod.models.cblof', - 'pyod.models.cof', - 'pyod.models.hbos', - 'pyod.models.knn', - 'pyod.models.lof', - 'pyod.models.ocsvm', - 'pyod.models.pca', - 'pyod.models.mcd', - 'pyod.models.sod', - 'pyod.models.sos'] - - df = pd.DataFrame({'ID' : model_id, - 'Name' : model_name, - 'Reference' : model_ref}) - - df.set_index('ID', inplace=True) - - return df - -def get_logs(experiment_name = None, save = False): - - """ - - Description: - ------------ - Returns a table with experiment logs consisting - run details, parameter, metrics and tags. - - Example - ------- - logs = get_logs() - - This will return pandas dataframe. - - Parameters - ---------- - experiment_name : string, default = None - When set to None current active run is used. - - save : bool, default = False - When set to True, csv file is saved in current directory. 
- - - """ - - import sys - - if experiment_name is None: - exp_name_log_ = exp_name_log - else: - exp_name_log_ = experiment_name - - import mlflow - from mlflow.tracking import MlflowClient - - client = MlflowClient() - - if client.get_experiment_by_name(exp_name_log_) is None: - sys.exit('No active run found. Check logging parameter in setup or to get logs for inactive run pass experiment_name.') - - exp_id = client.get_experiment_by_name(exp_name_log_).experiment_id - runs = mlflow.search_runs(exp_id) - - if save: - file_name = str(exp_name_log_) + '_logs.csv' - runs.to_csv(file_name, index=False) - - return runs - -def get_config(variable): - - """ - Description: - ------------ - This function is used to access global environment variables. - Following variables can be accessed: - - - X: Transformed dataset - - data_: Original dataset - - seed: random state set through session_id - - prep_pipe: Transformation pipeline configured through setup - - prep_param: prep_param configured through setup - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - exp_name_log: Name of experiment set through setup - - logging_param: log_experiment param set through setup - - log_plots_param: log_plots param set through setup - - USI: Unique session ID parameter set through setup - - Example: - -------- - X = get_config('X') - - This will return transformed dataset. - - - """ - - import logging - - try: - hasattr(logger, 'name') - except: - logger = logging.getLogger('logs') - logger.setLevel(logging.DEBUG) - - # create console handler and set level to debug - if logger.hasHandlers(): - logger.handlers.clear() - - ch = logging.FileHandler('logs.log') - ch.setLevel(logging.DEBUG) - - # create formatter - formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') - - # add formatter to ch - ch.setFormatter(formatter) - - # add ch to logger - logger.addHandler(ch) - - logger.info("Initializing get_config()") - logger.info("""get_config(variable={})""".\ - format(str(variable))) - - if variable == 'X': - global_var = X - - if variable == 'data_': - global_var = data_ - - if variable == 'seed': - global_var = seed - - if variable == 'prep_pipe': - global_var = prep_pipe - - if variable == 'prep_param': - global_var = prep_param - - if variable == 'n_jobs_param': - global_var = n_jobs_param - - if variable == 'html_param': - global_var = html_param - - if variable == 'exp_name_log': - global_var = exp_name_log - - if variable == 'logging_param': - global_var = logging_param - - if variable == 'log_plots_param': - global_var = log_plots_param - - if variable == 'USI': - global_var = USI - - logger.info("Global variable: " + str(variable) + ' returned') - logger.info("get_config() succesfully completed......................................") - - return global_var - -def set_config(variable,value): - - """ - Description: - ------------ - This function is used to reset global environment variables. 
-    Following variables can be accessed:
-
-    - X: Transformed dataset
-    - data_: Original dataset
-    - seed: random state set through session_id
-    - prep_pipe: Transformation pipeline configured through setup
-    - prep_param: prep_param configured through setup
-    - n_jobs_param: n_jobs parameter used in model training
-    - html_param: html_param configured through setup
-    - exp_name_log: Name of experiment set through setup
-    - logging_param: log_experiment param set through setup
-    - log_plots_param: log_plots param set through setup
-    - USI: Unique session ID parameter set through setup
-
-    Example:
-    --------
-    set_config('seed', 123)
-
-    This will set the global seed to 123.
-
-    """
-
-    import logging
-
-    try:
-        hasattr(logger, 'name')
-    except:
-        logger = logging.getLogger('logs')
-        logger.setLevel(logging.DEBUG)
-
-        # create file handler and set level to debug
-        if logger.hasHandlers():
-            logger.handlers.clear()
-
-        ch = logging.FileHandler('logs.log')
-        ch.setLevel(logging.DEBUG)
-
-        # create formatter
-        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
-
-        # add formatter to ch
-        ch.setFormatter(formatter)
-
-        # add ch to logger
-        logger.addHandler(ch)
-
-    logger.info("Initializing set_config()")
-    logger.info("""set_config(variable={}, value={})""".\
-        format(str(variable), str(value)))
-
-    if variable == 'X':
-        global X
-        X = value
-
-    if variable == 'data_':
-        global data_
-        data_ = value
-
-    if variable == 'seed':
-        global seed
-        seed = value
-
-    if variable == 'prep_pipe':
-        global prep_pipe
-        prep_pipe = value
-
-    if variable == 'prep_param':
-        global prep_param
-        prep_param = value
-
-    if variable == 'n_jobs_param':
-        global n_jobs_param
-        n_jobs_param = value
-
-    if variable == 'html_param':
-        global html_param
-        html_param = value
-
-    if variable == 'exp_name_log':
-        global exp_name_log
-        exp_name_log = value
-
-    if variable == 'logging_param':
-        global logging_param
-        logging_param = value
-
-    if variable == 'log_plots_param':
-        global log_plots_param
-        log_plots_param = value
-
-    if variable == 'USI':
-        global USI
-        USI = value
-
-    logger.info("Global variable: " + str(variable) + ' updated')
-    logger.info("set_config() successfully completed......................................")
-
-def get_system_logs():
-
-    """
-    Read and print 'logs.log' file from current active directory
-    """
-
-    #context manager ensures the file handle is closed even on error
-    with open('logs.log', 'r') as file:
-        lines = file.read().splitlines()
-
-    for line in lines:
-        if not line:
-            continue
-
-        columns = [col.strip() for col in line.split(':') if col]
-        print(columns)
\ No newline at end of file
-- GitLab