Unverified commit fe3c58e5, authored by Nishant Rodrigues, committed by GitHub

Allow user parameters for shuffle (#87)

* fix typo in classification expert tutorial (#46)

* Fix some typos (#41)

* Add files via upload

* fix shuffle #68

- train_test_split
- shuffle parameter: default True
- classification + regression

* fix shuffle #12

- kfold + stratified kfold
- shuffle parameter: default False
- classification + regression

* small fixes: stratify
- stratify set to None
- Updated docs
Co-authored-by: Pratik Kumar <pr2tik1@gmail.com>
Co-authored-by: Arsen Poghosyan <arsen.v.poghosyan@gmail.com>
Co-authored-by: PyCaret <moez@pycaret.org>
Parent ba0146d4
In addition to bug fixes, the following upgrades are being considered for the 1.0.1 release. To upvote, please email moez@pycaret.org.
- Return models from compare_models. Currently compare_models() does not return any trained model object. (Impact: pycaret.classification, pycaret.regression)
- tune_model() function to work with a model object directly. In the current version a string parameter is passed, for example tune_model('lr'). In 1.0.1 this will change to tune_model(lr), where lr is an object created using create_model(). (Impact: pycaret.classification, pycaret.regression)
- Allow custom tuning grids to be passed into the tune_model function. Currently users cannot pass a custom grid. (Impact: pycaret.classification, pycaret.regression)
- Add matthews_corrcoef and log_loss metrics in classification. (Impact: pycaret.classification)
- Add shuffle parameter for train-test split in setup() function. (Impact: pycaret.classification, pycaret.regression)
- Add shuffle parameter for KFold in setup() function. (Impact: pycaret.classification, pycaret.regression) A usage sketch of both new parameters follows this list.
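
For illustration, a minimal usage sketch of the two shuffle parameters this commit adds to setup(). Only data_split_shuffle and folds_shuffle come from the diff below; the dataset, target column and remaining argument values are hypothetical.

# Hypothetical usage sketch; only data_split_shuffle and folds_shuffle are introduced by this commit.
from sklearn.datasets import load_breast_cancer
from pycaret.classification import setup

data = load_breast_cancer(as_frame=True).frame   # illustrative dataset with a 'target' column

exp = setup(data=data,
            target='target',
            train_size=0.7,
            session_id=123,
            silent=True,                 # skip the data-type confirmation prompt
            data_split_shuffle=True,     # new: shuffle rows during the train/test split (default True)
            folds_shuffle=False)         # new: shuffle rows within StratifiedKFold (default False)
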
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <span style=\"color:orange\">Binary Classification Tutorial (CLF103) - Level Expert</span>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Date Updated: Feb 25, 2020**\n",
"\n",
"# Work in progress\n",
"We are currently working on this tutorial. Please check back soon! \n",
"\n",
"### In the mean time, you can see: \n",
"- __[Binary Classification Tutorial (CLF101) - Level Beginner](https://github.com/pycaret/pycaret/blob/master/Tutorials/Binary%20Classification%20Tutorial%20Level%20Beginner%20-%20%20CLF101.ipynb)__\n",
"- __[Binary Classification Tutorial (CLF102) - Level Intermediate](https://github.com/pycaret/pycaret/blob/master/Tutorials/Binary%20Classification%20Tutorial%20Level%20Intermediate%20-%20CLF102.ipynb)__"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
......@@ -17,8 +17,8 @@
"We are currently working on this tutorial. Please check back soon! \n",
"\n",
"### In the mean time, you can see: \n",
"- __[Binary Classification Tutorial (CLF101) - Level Beginner](https://github.com/pycaret/pycaret/blob/master/Tutorials/BinaryClassificationTutorial(CLF101)_LevelBeginner.ipynb)__\n",
"- __[Binary Classification Tutorial (CLF102) - Level Intermediate](https://github.com/pycaret/pycaret/blob/master/Tutorials/BinaryClassificationTutorial(CLF102)_LevelIntermediate.ipynb)__"
"- __[Binary Classification Tutorial (CLF101) - Level Beginner](https://github.com/pycaret/pycaret/blob/master/Tutorials/Binary%20Classification%20Tutorial%20Level%20Beginner%20-%20%20CLF101.ipynb)__\n",
"- __[Binary Classification Tutorial (CLF102) - Level Intermediate](https://github.com/pycaret/pycaret/blob/master/Tutorials/Binary%20Classification%20Tutorial%20Level%20Intermediate%20-%20CLF102.ipynb)__"
]
}
],
......@@ -42,5 +42,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
The source diff for this file is too large to display. You can view the blob instead.
......@@ -48,7 +48,9 @@ def setup(data,
feature_ratio = False, #new #create checking exceptions and docstring
interaction_threshold = 0.01, #new #create checking exceptions and docstring
session_id = None,
silent=False,
silent = False,
data_split_shuffle = True, #new
folds_shuffle = False, #new
profile = False):
"""
......@@ -351,7 +353,14 @@ def setup(data,
When set to True, confirmation of data types is not required. All preprocessing will
be performed assuming automatically inferred data types. Not recommended for direct use
except for established pipelines.
data_split_shuffle: bool, default = True
If set to False, prevents shuffling of rows when splitting data. When set to False,
`stratify` is also updated to `None`, since a stratified split requires shuffling.
folds_shuffle: bool, default = False
If set to False, prevents shuffling of rows when using cross validation.
profile: bool, default = False
If set to true, a data profile for Exploratory Data Analysis will be displayed
in an interactive HTML report.
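
The note above about `stratify` being updated to `None` reflects a scikit-learn constraint: train_test_split raises a ValueError when a stratify array is combined with shuffle=False. A minimal standalone sketch of the pattern applied in the hunks that follow (the dataset and values are illustrative, not from the commit):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)    # illustrative dataset
data_split_shuffle = False                    # value a user might pass to setup()
seed = 123

# Stratify only when shuffling is enabled; scikit-learn rejects stratify together with shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y if data_split_shuffle else None,
    shuffle=data_split_shuffle,
    random_state=seed)
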
......@@ -1130,8 +1139,8 @@ def setup(data,
MONITOR UPDATE ENDS
'''
X_, X__, y_, y__ = train_test_split(X, y, test_size=1-i, stratify=y, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.3, stratify=y_, random_state=seed)
X_, X__, y_, y__ = train_test_split(X, y, test_size=1-i, stratify=y if data_split_shuffle else None, random_state=seed, shuffle=data_split_shuffle)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.3, stratify=y_ if data_split_shuffle else None, random_state=seed, shuffle=data_split_shuffle)
model.fit(X_train,y_train)
pred_ = model.predict(X_test)
try:
......@@ -1254,8 +1263,8 @@ def setup(data,
sample_size = input("Sample Size: ")
if sample_size == '' or sample_size == '1':
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, stratify=y, random_state=seed)
stratify = y if data_split_shuffle else None
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, stratify=stratify, random_state=seed, shuffle=data_split_shuffle)
'''
Final display Starts
......@@ -1341,11 +1350,11 @@ def setup(data,
else:
sample_n = float(sample_size)
X_selected, X_discard, y_selected, y_discard = train_test_split(X, y, test_size=1-sample_n, stratify=y,
random_state=seed)
X_selected, X_discard, y_selected, y_discard = train_test_split(X, y, test_size=1-sample_n, stratify=y if data_split_shuffle else None,
random_state=seed, shuffle=data_split_shuffle)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=1-train_size, stratify=y_selected,
random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=1-train_size, stratify=y_selected if data_split_shuffle else None,
random_state=seed, shuffle=data_split_shuffle)
clear_output()
......@@ -1436,7 +1445,8 @@ def setup(data,
monitor.iloc[1,1:] = 'Splitting Data'
update_display(monitor, display_id = 'monitor')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, stratify=y, random_state=seed)
stratify = y if data_split_shuffle else None
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, stratify=stratify, random_state=seed, shuffle=data_split_shuffle)
progress.value += 1
clear_output()
......@@ -1730,7 +1740,7 @@ def create_model(estimator = None,
progress.value += 1
#cross validation setup starts here
kf = StratifiedKFold(fold, random_state=seed)
kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle)
score_auc =np.empty((0,0))
score_acc =np.empty((0,0))
......@@ -2345,7 +2355,7 @@ def ensemble_model(estimator,
MONITOR UPDATE ENDS
'''
kf = StratifiedKFold(fold, random_state=seed)
kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle)
score_auc =np.empty((0,0))
score_acc =np.empty((0,0))
......@@ -3412,7 +3422,7 @@ def compare_models(blacklist = None,
'''
#cross validation setup starts here
kf = StratifiedKFold(fold, random_state=seed)
kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle)
score_acc =np.empty((0,0))
score_auc =np.empty((0,0))
......@@ -3902,7 +3912,7 @@ def tune_model(estimator = None,
progress.value += 1
kf = StratifiedKFold(fold, random_state=seed)
kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle)
score_auc =np.empty((0,0))
score_acc =np.empty((0,0))
......@@ -4859,7 +4869,7 @@ def blend_models(estimator_list = 'All',
kf = StratifiedKFold(fold, random_state=seed)
kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle)
'''
MONITOR UPDATE STARTS
......@@ -5564,7 +5574,7 @@ def stack_models(estimator_list,
model.fit(data_X, data_y)
models_.append(model)
kf = StratifiedKFold(fold, random_state=seed) #capturing fold requested by user
kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle) #capturing fold requested by user
score_auc =np.empty((0,0))
score_acc =np.empty((0,0))
......@@ -6188,7 +6198,7 @@ def create_stacknet(estimator_list,
meta_model_ = model.fit(data_X,data_y)
kf = StratifiedKFold(fold, random_state=seed) #capturing fold requested by user
kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle) #capturing fold requested by user
score_auc =np.empty((0,0))
score_acc =np.empty((0,0))
......@@ -6760,7 +6770,7 @@ def calibrate_model(estimator,
progress.value += 1
#cross validation setup starts here
kf = StratifiedKFold(fold, random_state=seed)
kf = StratifiedKFold(fold, random_state=seed, shuffle=folds_shuffle)
score_auc =np.empty((0,0))
score_acc =np.empty((0,0))
......
......@@ -4,8 +4,8 @@
def setup(data,
target,
train_size=0.7,
sampling=True,
train_size = 0.7,
sampling = True,
sample_estimator = None,
categorical_features = None,
categorical_imputation = 'constant',
......@@ -50,6 +50,8 @@ def setup(data,
transform_target_method = 'box-cox', #new
session_id = None,
silent = False,
data_split_shuffle = True, #new
folds_shuffle = False, #new
profile = False):
"""
......@@ -361,7 +363,13 @@ def setup(data,
When set to True, confirmation of data types is not required. All preprocessing will
be performed assuming automatically inferred data types. Not recommended for direct use
except for established pipelines.
data_split_shuffle: bool, default = True
If set to False, prevents shuffling of rows when splitting data
folds_shuffle: bool, default = False
If set to False, prevents shuffling of rows when using cross validation.
profile: bool, default = False
If set to true, a data profile for Exploratory Data Analysis will be displayed
in an interactive HTML report.
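
The folds_shuffle parameter documented above maps onto the KFold changes in the hunks further down. A minimal standalone sketch of what it toggles (illustrative data, not from the commit): with shuffle=False the folds keep the original row order, while shuffle=True reshuffles indices and uses random_state for reproducibility. Note that recent scikit-learn releases reject a random_state when shuffle=False, so the sketch only sets shuffle=True.

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)      # 10 illustrative rows
folds_shuffle = True                  # value a user might pass to setup()
seed = 123

# Fold membership is randomised but reproducible via random_state when shuffle=True.
kf = KFold(n_splits=5, shuffle=folds_shuffle, random_state=seed)
for train_idx, test_idx in kf.split(X):
    print(test_idx)                   # contiguous index blocks only when shuffle=False
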
......@@ -1158,8 +1166,8 @@ def setup(data,
MONITOR UPDATE ENDS
'''
X_, X__, y_, y__ = train_test_split(X, y, test_size=1-i, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.3, random_state=seed)
X_, X__, y_, y__ = train_test_split(X, y, test_size=1-i, random_state=seed, shuffle=data_split_shuffle)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.3, random_state=seed, shuffle=data_split_shuffle)
model.fit(X_train,y_train)
pred_ = model.predict(X_test)
......@@ -1226,7 +1234,7 @@ def setup(data,
if sample_size == '' or sample_size == '1':
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=seed, shuffle=data_split_shuffle)
'''
Final display Starts
......@@ -1318,10 +1326,10 @@ def setup(data,
sample_n = float(sample_size)
X_selected, X_discard, y_selected, y_discard = train_test_split(X, y, test_size=1-sample_n,
random_state=seed)
random_state=seed, shuffle=data_split_shuffle)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_selected, test_size=1-train_size,
random_state=seed)
random_state=seed, shuffle=data_split_shuffle)
clear_output()
......@@ -1416,7 +1424,7 @@ def setup(data,
monitor.iloc[1,1:] = 'Splitting Data'
update_display(monitor, display_id = 'monitor')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=seed, shuffle=data_split_shuffle)
progress.value += 1
clear_output()
......@@ -1709,7 +1717,7 @@ def create_model(estimator = None,
progress.value += 1
#cross validation setup starts here
kf = KFold(fold, random_state=seed)
kf = KFold(fold, random_state=seed, shuffle=folds_shuffle)
score_mae =np.empty((0,0))
score_mse =np.empty((0,0))
......@@ -2283,7 +2291,7 @@ def ensemble_model(estimator,
MONITOR UPDATE ENDS
'''
kf = KFold(fold, random_state=seed)
kf = KFold(fold, random_state=seed, shuffle=folds_shuffle)
score_mae =np.empty((0,0))
score_mse =np.empty((0,0))
......@@ -2879,7 +2887,7 @@ def compare_models(blacklist = None,
'''
#cross validation setup starts here
kf = KFold(fold, random_state=seed)
kf = KFold(fold, random_state=seed, shuffle=folds_shuffle)
score_mae =np.empty((0,0))
score_mse =np.empty((0,0))
......@@ -3242,7 +3250,7 @@ def blend_models(estimator_list = 'All',
mask = actual != 0
return (np.fabs(actual - prediction)/actual)[mask].mean()
kf = KFold(fold, random_state=seed)
kf = KFold(fold, random_state=seed, shuffle=folds_shuffle)
'''
MONITOR UPDATE STARTS
......@@ -3847,7 +3855,7 @@ def tune_model(estimator = None,
progress.value += 1
kf = KFold(fold, random_state=seed)
kf = KFold(fold, random_state=seed, shuffle=folds_shuffle)
score_mae =np.empty((0,0))
score_mse =np.empty((0,0))
......@@ -4954,7 +4962,7 @@ def stack_models(estimator_list,
model.fit(data_X, data_y)
models_.append(model)
kf = KFold(fold, random_state=seed) #capturing fold requested by user
kf = KFold(fold, random_state=seed, shuffle=folds_shuffle) #capturing fold requested by user
score_mae =np.empty((0,0))
score_mse =np.empty((0,0))
......@@ -5521,7 +5529,7 @@ def create_stacknet(estimator_list,
meta_model_ = model.fit(data_X,data_y)
kf = KFold(fold, random_state=seed) #capturing fold requested by user
kf = KFold(fold, random_state=seed, shuffle=folds_shuffle) #capturing fold requested by user
score_mae =np.empty((0,0))
score_mse =np.empty((0,0))
......