Unverified commit 19ee26c3, authored by PyCaret, committed by GitHub

Add files via upload

Parent 39b8adbc
......@@ -2,10 +2,10 @@
PyCaret is an end-to-end, open-source machine learning library for the Python programming language. Its primary objective is to reduce the cycle time from hypothesis to insights by providing an easy-to-use, high-level, unified API. PyCaret's vision is to become the de facto standard for teaching machine learning and data science. Our strength is our easy-to-use, unified interface for both supervised and unsupervised learning. It saves the time and effort that citizen data scientists, students and researchers spend on coding, or on learning to code across different interfaces, so that they can focus on the business problem.
## Current Release
The current release is beta 0.0.34 (as of 05/02/2020). A full release is targeted in the first week of February 2020.
The current release is beta 0.0.35 (as of 07/02/2020). A full release is targeted in the first week of February 2020.
## Features Currently Available
As of beta 0.0.34, the following modules are generally available:
As of beta 0.0.35, the following modules are generally available:
* pycaret.datasets <br/>
* pycaret.classification (binary and multiclass) <br/>
* pycaret.regression <br/>
......@@ -31,7 +31,7 @@ pip install pycaret
```
## Quick Start
As of beta 0.0.34, the classification, regression, nlp, arules, anomaly and clustering modules are available.
As of beta 0.0.35, the classification, regression, nlp, arules, anomaly and clustering modules are available.
### Classification / Regression
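A minimal sketch of a classification workflow (the bundled 'juice' dataset and its 'Purchase' target are used here purely for illustration):

```python
# a minimal sketch, assuming the bundled 'juice' dataset with target 'Purchase'
from pycaret.datasets import get_data
from pycaret.classification import setup, create_model

data = get_data('juice')              # load a sample dataset
exp = setup(data, target='Purchase')  # initialize the experiment
lr = create_model('lr')               # train and cross-validate a logistic regression
```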
......
......@@ -3,13 +3,13 @@
# License: MIT
def setup(data,
categorical_features = None,
categorical_imputation = 'constant',
ordinal_features = None, #new
high_cardinality_features = None, #latest
numeric_features = None,
numeric_imputation = 'mean',m
numeric_imputation = 'mean',
date_features = None,
ignore_features = None,
normalize = False,
......@@ -75,6 +75,12 @@ def setup(data,
be passed as ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
The list sequence must be in increasing order from lowest to highest.
high_cardinality_features: string, default = None
When the data contains features with high cardinality, they can be compressed
into fewer levels by passing them as a list of column names with high cardinality.
Features are compressed using their frequency distribution: the original features
are replaced by frequency values and converted into numeric variables.
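To make the new parameter concrete, a minimal usage sketch (the clustering-module import and the 'zip_code' column are assumptions for illustration):

```python
# a minimal sketch, assuming a dataframe `df` with a high-cardinality 'zip_code' column
from pycaret.clustering import setup

exp = setup(df, high_cardinality_features=['zip_code'])  # 'zip_code' levels become frequency counts
```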
numeric_features: string, default = None
If the inferred data types are not correct, numeric_features can be used to
overwrite the inferred type. If when running setup the type of 'column1' is
......@@ -308,6 +314,18 @@ def setup(data,
text = "Column name '" + str(i) + "' doesn't contain any level named '" + str(j) + "'."
sys.exit(text)
#high_cardinality_features
if high_cardinality_features is not None:
    if type(high_cardinality_features) is not list:
        sys.exit("(Type Error): high_cardinality_features param only accepts column names as a list. ")
    data_cols = data.columns
    #data_cols = data_cols.drop(target)
    for i in high_cardinality_features:
        if i not in data_cols:
            sys.exit("(Value Error): Column type forced is either the target column or doesn't exist in the dataset.")
#checking numeric imputation
allowed_numeric_imputation = ['mean', 'median']
if numeric_imputation not in allowed_numeric_imputation:
......@@ -629,7 +647,24 @@ def setup(data,
ordinal_columns_and_categories_pass = ordinal_features
else:
ordinal_columns_and_categories_pass = {}
if apply_ordinal_encoding_pass is True:
    ordinal_columns_and_categories_pass = ordinal_features
else:
    ordinal_columns_and_categories_pass = {}

#high cardinality
if high_cardinality_features is not None:
    apply_cardinality_reduction_pass = True
else:
    apply_cardinality_reduction_pass = False
cardinal_method_pass = 'count'
if apply_cardinality_reduction_pass:
cardinal_features_pass = high_cardinality_features
else:
cardinal_features_pass = []
#display dtypes
if supervised is False:
......@@ -644,6 +679,9 @@ def setup(data,
categorical_features = cat_features_pass,
apply_ordinal_encoding = apply_ordinal_encoding_pass, #new
ordinal_columns_and_categories = ordinal_columns_and_categories_pass,
apply_cardinality_reduction = apply_cardinality_reduction_pass, #latest
cardinal_method = cardinal_method_pass, #latest
cardinal_features = cardinal_features_pass, #latest
numerical_features = numeric_features_pass,
time_features = date_features_pass,
features_todrop = ignore_features_pass,
......@@ -737,7 +775,12 @@ def setup(data,
group_features_grid = True
else:
group_features_grid = False
if high_cardinality_features is not None:
high_cardinality_features_grid = True
else:
high_cardinality_features_grid = False
learned_types = preprocess.dtypes.learent_dtypes
#learned_types.drop(target, inplace=True)
......@@ -788,9 +831,10 @@ def setup(data,
functions = pd.DataFrame ( [ ['session_id ', seed ],
['Original Data ', shape ],
['Missing Values ', missing_flag],
['Numeric Features ', float_type-1 ],
['Categorical Features ', cat_type ],
['Numeric Features ', str(float_type-1) ],
['Categorical Features ', str(cat_type) ],
['Ordinal Features ', ordinal_features_grid],
['High Cardinality Features ', high_cardinality_features_grid],
['Transformed Data ', shape_transformed ],
['Numeric Imputer ', numeric_imputation],
['Categorical Imputer ', categorical_imputation],
......@@ -1271,6 +1315,7 @@ def assign_model(model,
return data__
def tune_model(model=None,
supervised_target=None,
method='drop',
......@@ -1752,12 +1797,21 @@ def tune_model(model=None,
else:
ordinal_features_pass = prep_param.ordinal.info_as_dict
#HIGH CARDINALITY
#---------------#
if 'Empty' in str(prep_param.cardinality):
high_cardinality_features_pass = None
else:
high_cardinality_features_pass = prep_param.cardinality.feature
global setup_without_target
setup_without_target = setup(data = data_,
categorical_features = cat_pass,
categorical_imputation = cat_impute_pass,
ordinal_features = ordinal_features_pass, #new
high_cardinality_features = high_cardinality_features_pass, #latest
numeric_features = num_pass,
numeric_imputation = num_impute_pass,
date_features = time_pass,
......@@ -2408,6 +2462,7 @@ def tune_model(model=None,
def plot_model(model,
plot = 'tsne',
feature = None):
......
(This diff is collapsed.)
......@@ -8,6 +8,7 @@ def setup(data,
categorical_features = None,
categorical_imputation = 'constant',
ordinal_features = None, #new
high_cardinality_features = None, #latest
numeric_features = None,
numeric_imputation = 'mean',
date_features = None,
......@@ -75,6 +76,12 @@ def setup(data,
be passed as ordinal_features = { 'column_name' : ['low', 'medium', 'high'] }.
The list sequence must be in increasing order from lowest to highest.
high_cardinality_features: string, default = None
When the data contains features with high cardinality, they can be compressed
into fewer levels by passing them as a list of column names with high cardinality.
Features are compressed using their frequency distribution: the original features
are replaced by frequency values and converted into numeric variables.
numeric_features: string, default = None
If the inferred data types are not correct, numeric_features can be used to
overwrite the inferred type. If when running setup the type of 'column1' is
......@@ -308,6 +315,18 @@ def setup(data,
text = "Column name '" + str(i) + "' doesn't contain any level named '" + str(j) + "'."
sys.exit(text)
#high_cardinality_features
if high_cardinality_features is not None:
    if type(high_cardinality_features) is not list:
        sys.exit("(Type Error): high_cardinality_features param only accepts column names as a list. ")
    data_cols = data.columns
    #data_cols = data_cols.drop(target)
    for i in high_cardinality_features:
        if i not in data_cols:
            sys.exit("(Value Error): Column type forced is either the target column or doesn't exist in the dataset.")
#checking numeric imputation
allowed_numeric_imputation = ['mean', 'median']
if numeric_imputation not in allowed_numeric_imputation:
......@@ -624,12 +643,24 @@ def setup(data,
apply_ordinal_encoding_pass = True
else:
apply_ordinal_encoding_pass = False
if apply_ordinal_encoding_pass is True:
    ordinal_columns_and_categories_pass = ordinal_features
else:
    ordinal_columns_and_categories_pass = {}

#high cardinality
if high_cardinality_features is not None:
    apply_cardinality_reduction_pass = True
else:
    apply_cardinality_reduction_pass = False
cardinal_method_pass = 'count'
if apply_cardinality_reduction_pass:
cardinal_features_pass = high_cardinality_features
else:
cardinal_features_pass = []
#display dtypes
if supervised is False:
......@@ -644,6 +675,9 @@ def setup(data,
categorical_features = cat_features_pass,
apply_ordinal_encoding = apply_ordinal_encoding_pass, #new
ordinal_columns_and_categories = ordinal_columns_and_categories_pass,
apply_cardinality_reduction = apply_cardinality_reduction_pass, #latest
cardinal_method = cardinal_method_pass, #latest
cardinal_features = cardinal_features_pass, #latest
numerical_features = numeric_features_pass,
time_features = date_features_pass,
features_todrop = ignore_features_pass,
......@@ -737,6 +771,11 @@ def setup(data,
group_features_grid = True
else:
group_features_grid = False
if high_cardinality_features is not None:
high_cardinality_features_grid = True
else:
high_cardinality_features_grid = False
learned_types = preprocess.dtypes.learent_dtypes
#learned_types.drop(target, inplace=True)
......@@ -788,9 +827,10 @@ def setup(data,
functions = pd.DataFrame ( [ ['session_id ', seed ],
['Original Data ', shape ],
['Missing Values ', missing_flag],
['Numeric Features ', float_type-1 ],
['Categorical Features ', cat_type ],
['Numeric Features ', str(float_type-1) ],
['Categorical Features ', str(cat_type) ],
['Ordinal Features ', ordinal_features_grid],
['High Cardinality Features ', high_cardinality_features_grid],
['Transformed Data ', shape_transformed ],
['Numeric Imputer ', numeric_imputation],
['Categorical Imputer ', categorical_imputation],
......@@ -1700,12 +1740,21 @@ def tune_model(model=None,
else:
ordinal_features_pass = prep_param.ordinal.info_as_dict
#HIGH CARDINALITY
#---------------#
if 'Empty' in str(prep_param.cardinality):
high_cardinality_features_pass = None
else:
high_cardinality_features_pass = prep_param.cardinality.feature
global setup_without_target
setup_without_target = setup(data = data_,
categorical_features = cat_pass,
categorical_imputation = cat_impute_pass,
ordinal_features = ordinal_features_pass, #new
high_cardinality_features = high_cardinality_features_pass, #latest
numeric_features = num_pass,
numeric_imputation = num_impute_pass,
date_features = time_pass,
......
......@@ -2553,28 +2553,24 @@ def evaluate_model(model):
from ipywidgets import widgets
from ipywidgets.widgets import interact, fixed, interact_manual
import numpy as np
"""
generate sorted list
"""
assigned_df = assign_model(model)
dd = list(assigned_df['Dominant_Topic'].unique())
dd2 = []
for i in dd:
dd2.append(i.split())
dd3 = []
for i in dd2:
dd3.append(int(i[1]))
dd3.sort()
try:
n_topic_assigned = len(model.show_topics())
except:
try:
n_topic_assigned = model.num_topics
except:
n_topic_assigned = model.n_components
final_list = []
for i in dd3:
final_list.append('Topic '+str(i))
for i in range(0,n_topic_assigned):
final_list.append('Topic ' +str(i))
a = widgets.ToggleButtons(
options=[('Frequency Plot', 'frequency'),
......
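The rewritten block above stops parsing topic labels out of the assigned dataframe and instead asks the model itself how many topics it has. A minimal sketch (editor's illustration) of that fallback chain, assuming a gensim or sklearn topic model:

```python
# a minimal sketch: gensim LDA exposes show_topics()/num_topics, while sklearn
# decomposition models (e.g. NMF, LatentDirichletAllocation) expose n_components
def count_topics(model):
    try:
        return len(model.show_topics())  # gensim (show_topics returns up to 10 topics by default)
    except AttributeError:
        try:
            return model.num_topics      # other gensim models
        except AttributeError:
            return model.n_components    # sklearn models
```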
......@@ -39,8 +39,8 @@ import datefinder
from datetime import datetime
import calendar
from sklearn.preprocessing import LabelEncoder
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.max_rows', 500)
#ignore warnings
import warnings
......@@ -1348,6 +1348,146 @@ class Cluster_Entire_Data(BaseEstimator,TransformerMixin):
data['data_cluster'] = predict
data['data_cluster'] = data['data_cluster'].astype('object')
return(data)
#__________________________________________________________________________________________________________________________________________
# Clustering categorical data
class Reduce_Cardinality_with_Clustering(BaseEstimator,TransformerMixin):
'''
- Reduces the number of levels in a categorical column (its cardinality) through clustering
- Highly recommended to run the DataTypes_Auto_infer class first
Args:
target_variable: target variable (integer or numerical only)
catagorical_feature: list of features on which clustering is to be applied / whose cardinality is to be reduced
check_clusters_upto: to determine the optimum number of kmeans clusters, set the upper limit of clusters
'''
def __init__(self, target_variable, catagorical_feature=[], check_clusters_upto=30,random_state=42):
self.target = target_variable
self.feature = catagorical_feature
self.check_clusters = check_clusters_upto + 1
self.random= random_state
def fit(self,data,y=None):
return(None)
def transform(self,dataset,y=None):
data= dataset.copy()
# we already know which level belongs to which cluster, so all we need is to replace levels with the clusters learned from the training data set
for i,z in zip(self.feature,self.ph_data):
data[i] = data[i].replace(list(z['levels']),z['cluster'])
return(data)
def fit_transform(self,dataset,y=None):
data = dataset.copy()
# first convert to dummy
if len(data.select_dtypes(include='object').columns)>0:
self.dummy = Dummify(self.target)
data_t = self.dummy.fit_transform(data.drop(self.feature,axis=1))
#data_t1 = data_t1.drop(self.target,axis=1)
else:
data_t = data.drop(self.feature,axis=1).copy()
# now make PLS
self.pls = PLSRegression(n_components=2) # we only use two components for grouping #PLSRegression(n_components=len(data_t1.columns)-1)
data_pls = self.pls.fit_transform(data_t.drop(self.target,axis=1),data_t[self.target])[0]
# now we calculate the mean, median, min, max and std of the two components, grouped by the categorical levels
self.ph_data = []
self.ph_clusters = []
for i in self.feature:
data_t1 = pd.DataFrame(dict(levels=data[i],comp1=data_pls[:,0],comp2=data_pls[:,1]),index=data.index)
# now group by feature
data_t1 = data_t1.groupby('levels')
data_t1 = data_t1[['comp1','comp2']].agg(['mean', 'median','min','max','std']) #this gives us a df with only numeric columns and the level as index
# sometimes, if a level has only one record, its std comes up as NaN, so convert NaN to 1
data_t1.fillna(1,inplace=True)
# the number of clusters can't exceed the number of samples in the aggregated data, so
self.check_clusters = min(self.check_clusters,len(data_t1))
# make a placeholder for 2 up to check_clusters clusters
self.ph = pd.DataFrame(np.arange(2,self.check_clusters,1), columns= ['clusters'])
self.ph['Silhouette'] = float(0)
self.ph['calinski'] = float(0)
# Now start making clusters
for k in self.ph.index:
c = self.ph['clusters'][k]
self.k_object = cluster.KMeans(n_clusters= c,init='k-means++',precompute_distances='auto',n_init=10,random_state=self.random)
self.k_object.fit(data_t1)
self.ph.iloc[k,1] = metrics.silhouette_score(data_t1,self.k_object.labels_)
self.ph.iloc[k,2] = metrics.calinski_harabasz_score(data_t1,self.k_object.labels_)
# now standardize the scores and make a total column
m = MinMaxScaler((-1,1))
self.ph['calinski'] = m.fit_transform(np.array(self.ph['calinski']).reshape(-1,1))
self.ph['Silhouette'] = m.fit_transform(np.array(self.ph['Silhouette']).reshape(-1,1))
self.ph['total']= self.ph['Silhouette'] + self.ph['calinski']
# sort by the total column and take the cluster count with the highest score; that represents the optimal number of clusters
try:
self.clusters = int(self.ph[self.ph['total'] == max(self.ph['total'])]['clusters'])
except: # in case there isn't a decisive measure, use calinski as the yardstick
self.clusters= int(self.ph[self.ph['calinski'] == max(self.ph['calinski'])]['clusters'])
self.ph_clusters.append(self.ph)
# Now make the final cluster object
self.k_object = cluster.KMeans(n_clusters= self.clusters,init='k-means++',precompute_distances='auto',n_init=10,random_state=self.random)
# now do fit predict
predict =self.k_object.fit_predict(data_t1)
# put it back with the group by aggregate columns
data_t1['cluster'] = predict
data_t1['cluster'] = data_t1['cluster'] .apply(str)
# we don't need all the columns; only the cluster column is required, along with the index (the index is named 'levels' from the groupby)
data_t1 = data_t1[['cluster']]
data_t1.reset_index(level=0, inplace=True) # convert the index to a column; this table now only contains every level and its cluster
#self.data_t1= data_t1
# we can now replace the original levels with their clusters in the original data frame
data[i] = data[i].replace(list(data_t1['levels']),data_t1['cluster'])
self.ph_data.append(data_t1)
return(data)
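The cluster-count search above (silhouette plus calinski-harabasz, each min-max scaled to [-1, 1], best combined score wins) can be read in isolation. A minimal sketch (editor's illustration, not part of the diff), assuming a numeric matrix `X`:

```python
import numpy as np
from sklearn import cluster, metrics
from sklearn.preprocessing import MinMaxScaler

def pick_k(X, k_max=10, random_state=42):
    # candidate cluster counts; silhouette needs 2 <= k <= n_samples - 1
    ks = list(range(2, min(k_max, len(X) - 1) + 1))
    sil, cal = [], []
    for k in ks:
        labels = cluster.KMeans(n_clusters=k, n_init=10,
                                random_state=random_state).fit_predict(X)
        sil.append(metrics.silhouette_score(X, labels))
        cal.append(metrics.calinski_harabasz_score(X, labels))
    # scale both scores to [-1, 1] and pick the k with the best combined score
    m = MinMaxScaler((-1, 1))
    total = (m.fit_transform(np.array(sil).reshape(-1, 1)) +
             m.fit_transform(np.array(cal).reshape(-1, 1))).ravel()
    return ks[int(total.argmax())]
```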
#____________________________________________________________________________________________________________________________________________
# Reducing cardinality of categorical data with counts
class Reduce_Cardinality_with_Counts(BaseEstimator,TransformerMixin):
'''
- Reduces the levels of a categorical column by replacing each level with its count & converting the column from object into float
Args:
catagorical_feature: list of features whose cardinality is to be reduced
'''
def __init__(self, catagorical_feature=[]):
self.feature = catagorical_feature
def fit(self,data,y=None):
return(None)
def transform(self,dataset,y=None):
data= dataset.copy()
# we already know level counts
for i,z,k in zip(self.feature,self.ph_data,self.ph_u):
data[i] = data[i].replace(k,z['counts'])
data[i] = data[i].astype('float64')
return(data)
def fit_transform(self,dataset,y=None):
data = dataset.copy()
#
self.ph_data = []
self.ph_u= []
for i in self.feature:
data_t1 = pd.DataFrame(dict(levels=data[i].groupby(data[i], sort=False).count().index,counts =data[i].groupby(data[i], sort=False).count().values))
u = data[i].unique()
# replace levels with counts
data[i].replace(u,data_t1['counts'],inplace=True)
data[i] = data[i].astype('float64')
self.ph_data.append(data_t1)
self.ph_u.append(u)
return(data)
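The count encoding above can be seen end to end on a toy column. A minimal sketch (editor's illustration, not part of the diff):

```python
# a minimal sketch of count encoding, as performed by Reduce_Cardinality_with_Counts
import pandas as pd

df = pd.DataFrame({'city': ['NY', 'LA', 'NY', 'SF', 'NY', 'LA']})
counts = df['city'].groupby(df['city'], sort=False).count()   # NY: 3, LA: 2, SF: 1
df['city'] = df['city'].replace(list(counts.index), list(counts.values))
df['city'] = df['city'].astype('float64')
print(df['city'].tolist())  # [3.0, 2.0, 3.0, 1.0, 3.0, 2.0]
```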
#____________________________________________________________________________________________________________________________________________
# nonlinear transformations
......@@ -2082,6 +2222,7 @@ def Preprocess_Path_One(train_data,target_variable,ml_usecase=None,test_data =No
club_rare_levels = False, rara_level_threshold_percentage =0.05,
apply_untrained_levels_treatment= False,untrained_levels_treatment_method = 'least frequent',
apply_ordinal_encoding = False, ordinal_columns_and_categories= {},
apply_cardinality_reduction=False, cardinal_method = 'cluster', cardinal_features=[],
apply_binning=False, features_to_binn =[],
apply_grouping= False , group_name=[] , features_to_group_ListofList=[[]],
apply_polynomial_trigonometry_features = False, max_polynomial=2,trigonometry_calculations=['sin','cos','tan'], top_poly_trig_features_to_select_percentage=.20,
......@@ -2106,19 +2247,20 @@ def Preprocess_Path_One(train_data,target_variable,ml_usecase=None,test_data =No
- 4) Drop categorical variables that have zero variance or near zero variance
- 5) Club categorical variable levels that are rare / at the bottom 5% of the variable distribution together as a new level (other_infrequent)
- 6) Club unseen levels in test dataset with most/least frequent levels in train dataset
- 7) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
- 8) Group features by calculating min, max, mean, median & sd of similar features
- 9) Make nonlinear features (polynomial, sin, cos & tan)
-10) Scale & power transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including an option to transform the target variable
-11) Apply binning to continuous variables when numeric features are provided as a list
-12) Detect & remove outliers using isolation forest, knn and PCA
-13) Apply clusters to segment the entire data
-14) One Hot / Dummy encoding
-15) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
-16) Feature selection through Random Forest, LightGBM and Pearson Correlation
-17) Fix multicollinearity
-18) Feature interaction (DFS): multiply, divide, add and subtract features
-19) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne
- 7) Reduce high cardinality in categorical features using clustering or counts (see the sketch after this docstring)
- 8) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
- 9) Group features by calculating min, max, mean, median & sd of similar features
-10) Make nonlinear features (polynomial, sin, cos & tan)
-11) Scale & power transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including an option to transform the target variable
-12) Apply binning to continuous variables when numeric features are provided as a list
-13) Detect & remove outliers using isolation forest, knn and PCA
-14) Apply clusters to segment the entire data
-15) One Hot / Dummy encoding
-16) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
-17) Feature selection through Random Forest, LightGBM and Pearson Correlation
-18) Fix multicollinearity
-19) Feature interaction (DFS): multiply, divide, add and subtract features
-20) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne
- except for pca_liner, all other methods only take the number of components (as an integer), i.e. no variance-explanation method is available
'''
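A minimal usage sketch of the new cardinality options on this path (the dataframe `train`, its 'target' column and its 'merchant_id' column are assumptions for illustration):

```python
# a minimal sketch, assuming a dataframe `train` with target 'target' and a
# high-cardinality 'merchant_id' column
train_t = Preprocess_Path_One(train_data=train,
                              target_variable='target',
                              apply_cardinality_reduction=True,
                              cardinal_method='count',
                              cardinal_features=['merchant_id'])
```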
global c2, subcase
......@@ -2171,6 +2313,22 @@ def Preprocess_Path_One(train_data,target_variable,ml_usecase=None,test_data =No
else:
new_levels= Empty()
# untrained levels in test (ordinal specific)
if apply_untrained_levels_treatment == True:
global new_levels1
new_levels1 = New_Catagorical_Levels_in_TestData(target=target_variable,replacement_strategy=untrained_levels_treatment_method)
else:
new_levels1= Empty()
# cardinality:
global cardinality
if apply_cardinality_reduction==True and cardinal_method =='cluster':
cardinality = Reduce_Cardinality_with_Clustering(target_variable=target_variable, catagorical_feature=cardinal_features, check_clusters_upto=50,random_state=random_state)
elif apply_cardinality_reduction==True and cardinal_method =='count':
cardinality = Reduce_Cardinality_with_Counts(catagorical_feature=cardinal_features)
else:
cardinality= Empty()
# ordinal coding
if apply_ordinal_encoding == True:
global ordinal
......@@ -2278,10 +2436,12 @@ def Preprocess_Path_One(train_data,target_variable,ml_usecase=None,test_data =No
pipe = Pipeline([
('dtypes',dtypes),
('imputer',imputer),
('new_levels1',new_levels1), # used for ordinal features, so that a new level appearing in a feature marked ordinal can be handled
('ordinal',ordinal),
('znz',znz),
('club_R_L',club_R_L),
('new_levels',new_levels),
('cardinality',cardinality),
('feature_time',feature_time),
('group',group),
('nonliner',nonliner),
......@@ -2313,6 +2473,7 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
apply_zero_nearZero_variance = False,
club_rare_levels = False, rara_level_threshold_percentage =0.05,
apply_untrained_levels_treatment= False,untrained_levels_treatment_method = 'least frequent',
apply_cardinality_reduction=False, cardinal_method = 'cluster', cardinal_features=[],
apply_ordinal_encoding = False, ordinal_columns_and_categories= {},
apply_binning=False, features_to_binn =[],
apply_grouping= False , group_name=[] , features_to_group_ListofList=[[]],
......@@ -2334,15 +2495,16 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
- 4) Drop categorical variables that have zero variance or near zero variance
- 5) Club categorical variable levels that are rare / at the bottom 5% of the variable distribution together as a new level (other_infrequent)
- 6) Club unseen levels in test dataset with most/least frequent levels in train dataset
- 7) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
- 8) Group features by calculating min, max, mean, median & sd of similar features
- 9) Scale & power transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including an option to transform the target variable
-10) Apply binning to continuous variables when numeric features are provided as a list
-11) Detect & remove outliers using isolation forest, knn and PCA
-12) One Hot / Dummy encoding
-13) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
-14) Fix multicollinearity
-15) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne
- 7) Reduce high cardinality in categorical features using clustering or counts
- 8) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
- 9) Group features by calculating min, max, mean, median & sd of similar features
-10) Scale & power transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including an option to transform the target variable
-11) Apply binning to continuous variables when numeric features are provided as a list
-12) Detect & remove outliers using isolation forest, knn and PCA
-13) One Hot / Dummy encoding
-14) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
-15) Fix multicollinearity
-16) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne
- except for pca_liner, all other methods only take the number of components (as an integer), i.e. no variance-explanation method is available
'''
......@@ -2392,6 +2554,22 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
new_levels = New_Catagorical_Levels_in_TestData(target=target_variable,replacement_strategy=untrained_levels_treatment_method)
else:
new_levels= Empty()
# untrained levels in test (ordinal specific)
if apply_untrained_levels_treatment == True:
global new_levels1
new_levels1 = New_Catagorical_Levels_in_TestData(target=target_variable,replacement_strategy=untrained_levels_treatment_method)
else:
new_levels1= Empty()
# cardinality:
global cardinality
if apply_cardinality_reduction==True and cardinal_method =='cluster':
cardinality = Reduce_Cardinality_with_Clustering(target_variable=target_variable, catagorical_feature=cardinal_features, check_clusters_upto=50,random_state=random_state)
elif apply_cardinality_reduction==True and cardinal_method =='count':
cardinality = Reduce_Cardinality_with_Counts(catagorical_feature=cardinal_features)
else:
cardinality= Empty()
# ordinal coding
if apply_ordinal_encoding == True:
......@@ -2463,10 +2641,12 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
pipe = Pipeline([
('dtypes',dtypes),
('imputer',imputer),
('new_levels1',new_levels1), # used for ordinal features, so that a new level appearing in a feature marked ordinal can be handled
('ordinal',ordinal),
('znz',znz),
('club_R_L',club_R_L),
('new_levels',new_levels),
('cardinality',cardinality),
('feature_time',feature_time),
('group',group),
('scaling',scaling),
......@@ -2485,4 +2665,4 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
return(train_t.drop(target_variable,axis=1),test_t)
else:
train_t = pipe.fit_transform(train_data)
return(train_t.drop(target_variable,axis=1))
\ No newline at end of file
return(train_t.drop(target_variable,axis=1))
(This diff is collapsed.)
......@@ -27,7 +27,7 @@ def readme():
setup(
name="pycaret",
version="0.0.34",
version="0.0.35",
description="A Python package for supervised and unsupervised machine learning.",
long_description=readme(),
long_description_content_type="text/markdown",
......