未验证 提交 39b8adbc 编写于 作者: P PyCaret 提交者: GitHub

Add files via upload

上级 eaf9a7fc
......@@ -2,10 +2,10 @@
PyCaret is an end-to-end open source machine learning library for the Python programming language. Its primary objective is to reduce the cycle time from hypothesis to insights by providing an easy-to-use, high-level unified API. PyCaret's vision is to become the de facto standard for teaching machine learning and data science. Our strength is our easy-to-use unified interface for both supervised and unsupervised learning. It saves the time and effort that citizen data scientists, students, and researchers spend on coding or learning to code with different interfaces, so that they can focus on the business problem.
## Current Release
The current release is beta 0.0.33 (as of 04/02/2020). A full release is targetted in the first week of February 2020.
The current release is beta 0.0.34 (as of 05/02/2020). A full release is targeted in the first week of February 2020.
## Features Currently Available
As per beta 0.0.33 following modules are generally available:
As per beta 0.0.34 following modules are generally available:
* pycaret.datasets <br/>
* pycaret.classification (binary and multiclass) <br/>
* pycaret.regression <br/>
......@@ -31,7 +31,7 @@ pip install pycaret
```
## Quick Start
As of beta 0.0.33 classification, regression, nlp, arules, anomaly and clustering modules are available.
As of beta 0.0.34 classification, regression, nlp, arules, anomaly and clustering modules are available.
### Classification / Regression
......
......@@ -3,12 +3,13 @@
# License: MIT
def setup(data,
categorical_features = None,
categorical_imputation = 'constant',
ordinal_features = None, #new
numeric_features = None,
numeric_imputation = 'mean',
numeric_imputation = 'mean',
date_features = None,
ignore_features = None,
normalize = False,
......@@ -194,8 +195,8 @@ def setup(data,
remove_multicollinearity: bool, default = False
When set to True, the variables with inter-correlations higher than the threshold
defined under the multicollinearity_threshold param are dropped. When two features
are highly correlated with each other, the feature with less average correlation in
the feature space is dropped.
are highly correlated with each other, the feature with higher average correlation
in the feature space is dropped.
multicollinearity_threshold: float, default = 0.9
Threshold used for dropping the correlated features. Only comes into effect when
......@@ -434,6 +435,10 @@ def setup(data,
from IPython.display import display, HTML, clear_output, update_display
import datetime, time
#pandas option
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
#progress bar
max_steps = 4
......@@ -753,8 +758,8 @@ def setup(data,
"""
#reset pandas option
#pd.reset_option("display.max_rows")
#pd.reset_option("display.max_columns")
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
#create an empty list for pickling later.
if supervised is False:
......
......@@ -613,6 +613,10 @@ def setup(data,
import ipywidgets as ipw
from IPython.display import display, HTML, clear_output, update_display
import datetime, time
#pandas option
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
#progress bar
if sampling:
......@@ -1006,8 +1010,8 @@ def setup(data,
"""
#reset pandas option
#pd.reset_option("display.max_rows")
#pd.reset_option("display.max_columns")
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
#create an empty list for pickling later.
experiment__ = []
......@@ -1453,7 +1457,6 @@ def setup(data,
def create_model(estimator = None,
ensemble = False,
method = None,
......@@ -5069,8 +5072,8 @@ def stack_models(estimator_list,
meta_model = None,
fold = 10,
round = 4,
method = 'hard',
restack = False,
method = 'soft',
restack = True,
plot = False,
finalize = False,
verbose = True):
......@@ -5120,11 +5123,11 @@ def stack_models(estimator_list,
round: integer, default = 4
Number of decimal places the metrics in the score grid will be rounded to.
method: string, default = 'hard'
'hard', uses predicted class labels as an input to the meta model.
method: string, default = 'soft'
'soft', uses predicted probabilities as an input to the meta model.
'hard', uses predicted class labels as an input to the meta model.
restack: Boolean, default = False
restack: Boolean, default = True
When restack is set to True, raw data will be exposed to meta model when
making predictions, otherwise when False, only the predicted label or
probabilities is passed to meta model when making final predictions.
......@@ -5236,6 +5239,7 @@ def stack_models(estimator_list,
from IPython.display import display, HTML, clear_output, update_display
import time, datetime
from copy import deepcopy
from sklearn.base import clone
#copy estimator_list
estimator_list = deepcopy(estimator_list)
......@@ -5606,8 +5610,8 @@ def create_stacknet(estimator_list,
meta_model = None,
fold = 10,
round = 4,
method = 'hard',
restack = False,
method = 'soft',
restack = True,
finalize = False,
verbose = True):
......@@ -5654,11 +5658,11 @@ def create_stacknet(estimator_list,
round: integer, default = 4
Number of decimal places the metrics in the score grid will be rounded to.
method: string, default = 'hard'
'hard', uses predicted class labels as an input to the meta model.
method: string, default = 'soft'
'soft', uses predicted probabilities as an input to the meta model.
'hard', uses predicted class labels as an input to the meta model.
restack: Boolean, default = False
restack: Boolean, default = True
When restack is set to True, raw data and prediction of all layers will be
exposed to the meta model when making predictions. When set to False, only
the predicted label or probabilities of last layer is passed to meta model
......@@ -5709,7 +5713,7 @@ def create_stacknet(estimator_list,
'''
#testing
#no active test
#global inter_level_names
#exception checking
import sys
......@@ -5717,13 +5721,17 @@ def create_stacknet(estimator_list,
#checking estimator_list
if type(estimator_list[0]) is not list:
sys.exit("(Type Error): estimator_list parameter must be list of list. ")
#blocking stack_models usecase
if len(estimator_list) == 1:
sys.exit("(Type Error): Single Layer stacking must be performed using stack_models(). ")
#checking error for estimator_list
for i in estimator_list:
for j in i:
if 'sklearn' not in str(type(j)) and 'CatBoostClassifier' not in str(type(j)):
sys.exit("(Value Error): estimator_list parameter only trained model object")
#checking meta model
if meta_model is not None:
if 'sklearn' not in str(type(meta_model)) and 'CatBoostClassifier' not in str(type(meta_model)):
......@@ -5767,6 +5775,7 @@ def create_stacknet(estimator_list,
from IPython.display import display, HTML, clear_output, update_display
import time, datetime
from copy import deepcopy
from sklearn.base import clone
#copy estimator_list
estimator_list = deepcopy(estimator_list)
......@@ -5845,8 +5854,13 @@ def create_stacknet(estimator_list,
#defining inter_level names
for item in inter_level:
level_list=[]
for m in item:
inter_level_names = np.append(inter_level_names, str(m).split("(")[0])
if 'CatBoostClassifier' in str(m).split("(")[0]:
level_list.append('CatBoostClassifier')
else:
level_list.append(str(m).split("(")[0])
inter_level_names.append(level_list)
#defining data_X and data_y
if finalize:
......@@ -5931,15 +5945,15 @@ def create_stacknet(estimator_list,
MONITOR UPDATE STARTS
'''
monitor.iloc[1,1:] = 'Evaluating ' + inter_level_names[inter_counter]
monitor.iloc[1,1:] = 'Evaluating ' + inter_level_names[inter_counter][model_counter]
update_display(monitor, display_id = 'monitor')
'''
MONITOR UPDATE ENDS
'''
model = model.fit(X = base_array_df, y = data_y) #changed to data_y
inter_inner.append(model)
model = clone(model)
inter_inner.append(model.fit(X = base_array_df, y = data_y)) #changed to data_y
if method == 'soft':
try:
......@@ -5983,6 +5997,7 @@ def create_stacknet(estimator_list,
base_array_df = base_array_df.iloc[:,i:]
inter_counter += 1
progress.value += 1
model = meta_model
......@@ -6180,7 +6195,7 @@ def create_stacknet(estimator_list,
else:
clear_output()
return models_
return models_
......@@ -7295,7 +7310,7 @@ def predict_model(estimator,
"""
#testing
#no active tests
#global base_pred_df, base_pred_df_no_restack, df, df_restack, stacker_method, combined_df, inter_pred_df
#ignore warnings
import warnings
......@@ -7437,7 +7452,7 @@ def predict_model(estimator,
"""
base_pred = []
for i in stacker_base:
if stacker_method == 'soft':
if 'soft' in stacker_method:
try:
a = i.predict_proba(Xtest) #change
a = a[:,1]
......@@ -7456,7 +7471,7 @@ def predict_model(estimator,
base_pred_df_no_restack = base_pred_df.copy()
base_pred_df = pd.concat([Xtest,base_pred_df], axis=1)
"""
inter level predictions
......@@ -7468,15 +7483,16 @@ def predict_model(estimator,
inter_counter = 0
for level in stacker:
inter_pred_df = pd.DataFrame()
model_counter = 0
for model in level:
try:
if inter_counter == 0:
if stacker_method == 'soft':
if 'soft' in stacker_method: #changed
try:
p = model.predict_proba(base_pred_df)
p = p[:,1]
......@@ -7495,7 +7511,7 @@ def predict_model(estimator,
except:
p = model.predict(base_pred_df_no_restack)
else:
if stacker_method == 'soft':
if 'soft' in stacker_method:
try:
p = model.predict_proba(last_level_df)
p = p[:,1]
......@@ -7504,14 +7520,15 @@ def predict_model(estimator,
else:
p = model.predict(last_level_df)
except:
if stacker_method == 'soft':
if 'soft' in stacker_method:
try:
p = model.predict_proba(combined_df)
p = p[:,1]
except:
p = model.predict(combined_df)
p = model.predict(combined_df)
p = pd.DataFrame(p)
col = str(model).split("(")[0]
if 'CatBoostClassifier' in col:
col = 'CatBoostClassifier'
......@@ -7533,6 +7550,7 @@ def predict_model(estimator,
"""
#final meta predictions
try:
pred_ = stacker_meta.predict(combined_df)
except:
......@@ -7644,6 +7662,7 @@ def predict_model(estimator,
p = i.predict(Xtest) #change
else:
try:
p = i.predict_proba(Xtest) #change
p = p[:,1]
......@@ -7667,7 +7686,7 @@ def predict_model(estimator,
df.fillna(value=0,inplace=True)
df_restack.fillna(value=0,inplace=True)
#restacking check
try:
pred_ = meta_model.predict(df)
......@@ -7684,7 +7703,7 @@ def predict_model(estimator,
pred_prob = pred_prob[:,1]
except:
pass
if data is None:
sca = metrics.accuracy_score(ytest,pred_)
......@@ -7815,6 +7834,7 @@ def predict_model(estimator,
return X_test_
def deploy_model(model,
model_name,
authentication,
......
......@@ -195,8 +195,8 @@ def setup(data,
remove_multicollinearity: bool, default = False
When set to True, the variables with inter-correlations higher than the threshold
defined under the multicollinearity_threshold param are dropped. When two features
are highly correlated with each other, the feature with less average correlation in
the feature space is dropped.
are highly correlated with each other, the feature with higher average correlation
in the feature space is dropped.
multicollinearity_threshold: float, default = 0.9
Threshold used for dropping the correlated features. Only comes into effect when
......@@ -435,6 +435,10 @@ def setup(data,
from IPython.display import display, HTML, clear_output, update_display
import datetime, time
#pandas option
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
#progress bar
max_steps = 4
......@@ -754,8 +758,8 @@ def setup(data,
"""
#reset pandas option
#pd.reset_option("display.max_rows")
#pd.reset_option("display.max_columns")
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
#create an empty list for pickling later.
if supervised is False:
......
......@@ -635,7 +635,11 @@ def setup(data,
import ipywidgets as ipw
from IPython.display import display, HTML, clear_output, update_display
import datetime, time
#pandas option
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
#progress bar
if sampling:
max = 10 + 3
......@@ -1045,8 +1049,8 @@ def setup(data,
"""
#reset pandas option
#pd.reset_option("display.max_rows") #switch back on
#pd.reset_option("display.max_columns")
pd.reset_option("display.max_rows") #switch back on
pd.reset_option("display.max_columns")
#create an empty list for pickling later.
experiment__ = []
......@@ -4555,12 +4559,11 @@ def tune_model(estimator = None,
def stack_models(estimator_list,
meta_model = None,
fold = 10,
round = 4,
restack = False,
restack = True,
plot = False,
finalize = False,
verbose = True):
......@@ -4611,7 +4614,7 @@ def stack_models(estimator_list,
round: integer, default = 4
Number of decimal places the metrics in the score grid will be rounded to.
restack: Boolean, default = False
restack: Boolean, default = True
When restack is set to True, raw data will be exposed to meta model when
making predictions, otherwise when False, only the predicted label is passed
to meta model when making final predictions.
......@@ -5049,12 +5052,11 @@ def stack_models(estimator_list,
def create_stacknet(estimator_list,
meta_model = None,
fold = 10,
round = 4,
restack = False,
restack = True,
finalize = False,
verbose = True):
......@@ -5100,7 +5102,7 @@ def create_stacknet(estimator_list,
round: integer, default = 4
Number of decimal places the metrics in the score grid will be rounded to.
restack: Boolean, default = False
restack: Boolean, default = True
When restack is set to True, raw data and prediction of all layers will be
exposed to the meta model when making predictions. When set to False, only
the predicted label of last layer is passed to meta model when making final
......@@ -5143,7 +5145,7 @@ def create_stacknet(estimator_list,
'''
#for checking only
#No active test
global inter_level_names
#exception checking
import sys
......@@ -5151,7 +5153,11 @@ def create_stacknet(estimator_list,
#checking estimator_list
if type(estimator_list[0]) is not list:
sys.exit("(Type Error): estimator_list parameter must be list of list. ")
#blocking stack_models usecase
if len(estimator_list) == 1:
sys.exit("(Type Error): Single Layer stacking must be performed using stack_models(). ")
#checking error for estimator_list
for i in estimator_list:
for j in i:
......@@ -5185,12 +5191,15 @@ def create_stacknet(estimator_list,
'''
global inter_level_names
#pre-load libraries
import pandas as pd
import ipywidgets as ipw
from IPython.display import display, HTML, clear_output, update_display
import time, datetime
from copy import deepcopy
from sklearn.base import clone
#copy estimator_list
estimator_list = deepcopy(estimator_list)
......@@ -5238,7 +5247,6 @@ def create_stacknet(estimator_list,
progress.value += 1
base_level = estimator_list[0]
base_level_names = []
......@@ -5273,13 +5281,14 @@ def create_stacknet(estimator_list,
#defining inter_level names
for item in inter_level:
level_list=[]
for m in item:
inter_level_names = np.append(inter_level_names, str(m).split("(")[0])
if 'CatBoostRegressor' in str(m).split("(")[0]:
level_list.append('CatBoostRegressor')
else:
level_list.append(str(m).split("(")[0])
inter_level_names.append(level_list)
#defining inter_level names
for item in inter_level:
for m in item:
inter_level_names = np.append(inter_level_names, str(m).split("(")[0])
#defining data_X and data_y
if finalize:
......@@ -5350,15 +5359,17 @@ def create_stacknet(estimator_list,
MONITOR UPDATE STARTS
'''
monitor.iloc[1,1:] = 'Evaluating ' + inter_level_names[inter_counter]
monitor.iloc[1,1:] = 'Evaluating ' + inter_level_names[inter_counter][model_counter]
update_display(monitor, display_id = 'monitor')
'''
MONITOR UPDATE ENDS
'''
model = model.fit(X = base_array_df, y = data_y) #changed to data_y
inter_inner.append(model)
model = clone(model)
inter_inner.append(model.fit(X = base_array_df, y = data_y)) #changed to data_y
#model = model.fit(X = base_array_df, y = data_y) #changed to data_y
#inter_inner.append(model)
base_array = cross_val_predict(model,X = base_array_df, y = data_y,cv=fold, method='predict')
base_array = pd.DataFrame(base_array)
......@@ -5391,6 +5402,7 @@ def create_stacknet(estimator_list,
base_array_df = base_array_df.iloc[:,i:]
inter_counter += 1
progress.value += 1
model = meta_model
......@@ -5587,6 +5599,7 @@ def create_stacknet(estimator_list,
def plot_model(estimator,
plot = 'residuals'):
......@@ -6467,6 +6480,7 @@ def load_experiment(experiment_name):
def predict_model(estimator,
data=None,
platform=None,
......@@ -6992,6 +7006,7 @@ def predict_model(estimator,
def deploy_model(model,
model_name,
authentication,
......
......@@ -28,5 +28,4 @@ datefinder==0.7.0
datetime
DateTime==4.3
tqdm==4.36.1
awscli
boto3
\ No newline at end of file
awscli
\ No newline at end of file
......@@ -27,7 +27,7 @@ def readme():
setup(
name="pycaret",
version="0.0.33",
version="0.0.34",
description="A Python package for supervised and unsupervised machine learning.",
long_description=readme(),
long_description_content_type="text/markdown",
......@@ -47,5 +47,5 @@ setup(
"wordcloud", "textblob", "plotly==4.4.1", "cufflinks==0.17.0", "umap-learn",
"lightgbm==2.3.1", "pyLDAvis", "gensim", "spacy", "nltk", "mlxtend",
"pyod", "catboost==0.20.2", "pandas-profiling==2.3.0", "kmodes==0.10.1",
"datefinder==0.7.0", "datetime", "DateTime==4.3", "tqdm==4.36.1", "awscli", "boto3"]
"datefinder==0.7.0", "datetime", "DateTime==4.3", "tqdm==4.36.1", "awscli"]
)
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册