From 12c69800bdb7e3eb25de2ece9837ed79dc58df2b Mon Sep 17 00:00:00 2001
From: Nandiya
Date: Sat, 24 Oct 2020 21:07:27 +0700
Subject: [PATCH] Forecast (#3219)

* add forecasting code

* add statsmodel

* sort import

* sort import fix

* fixing black

* sort requirement

* optimize code

* try with limited data

* sort again

* sort fix

* sort fix

* delete warning and black

* add code for forecasting

* use black

* add more hints to describe

* add doctest

* finding whitespace

* fixing doctest

* delete

* revert back

* revert back

* revert back again

* revert back again

* revert back again

* try trimming whitespace

* try adding doctypeand etc

* fixing reviews

* deleting all the space

* fixing the build

* delete x

* add description for safety checker

* deleting subscription integer

* fix docthint

* make def to use function parameters and return values

* make def to use function parameters and return values

* type hints on data safety checker

* optimize code

* Update run.py

Co-authored-by: FVFYK3GEHV22
Co-authored-by: Christian Clauss
---
 machine_learning/forecasting/__init__.py |   0
 machine_learning/forecasting/ex_data.csv | 114 +++++++++++++++++
 machine_learning/forecasting/run.py      | 156 +++++++++++++++++++++++
 requirements.txt                         |   1 +
 4 files changed, 271 insertions(+)
 create mode 100644 machine_learning/forecasting/__init__.py
 create mode 100644 machine_learning/forecasting/ex_data.csv
 create mode 100644 machine_learning/forecasting/run.py

diff --git a/machine_learning/forecasting/__init__.py b/machine_learning/forecasting/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/machine_learning/forecasting/ex_data.csv b/machine_learning/forecasting/ex_data.csv
new file mode 100644
index 0000000..1c429e6
--- /dev/null
+++ b/machine_learning/forecasting/ex_data.csv
@@ -0,0 +1,114 @@
+total_user,total_events,days
+18231,0.0,1
+22621,1.0,2
+15675,0.0,3
+23583,1.0,4
+68351,5.0,5
+34338,3.0,6
+19238,0.0,0
+24192,0.0,1
+70349,0.0,2
+103510,0.0,3
+128355,1.0,4
+148484,6.0,5
+153489,3.0,6
+162667,1.0,0
+311430,3.0,1
+435663,7.0,2
+273526,0.0,3
+628588,2.0,4
+454989,13.0,5
+539040,3.0,6
+52974,1.0,0
+103451,2.0,1
+810020,5.0,2
+580982,3.0,3
+216515,0.0,4
+134694,10.0,5
+93563,1.0,6
+55432,1.0,0
+169634,1.0,1
+254908,4.0,2
+315285,3.0,3
+191764,0.0,4
+514284,7.0,5
+181214,4.0,6
+78459,2.0,0
+161620,3.0,1
+245610,4.0,2
+326722,5.0,3
+214578,0.0,4
+312365,5.0,5
+232454,4.0,6
+178368,1.0,0
+97152,1.0,1
+222813,4.0,2
+285852,4.0,3
+192149,1.0,4
+142241,1.0,5
+173011,2.0,6
+56488,3.0,0
+89572,2.0,1
+356082,2.0,2
+172799,0.0,3
+142300,1.0,4
+78432,2.0,5
+539023,9.0,6
+62389,1.0,0
+70247,1.0,1
+89229,0.0,2
+94583,1.0,3
+102455,0.0,4
+129270,0.0,5
+311409,1.0,6
+1837026,0.0,0
+361824,0.0,1
+111379,2.0,2
+76337,2.0,3
+96747,0.0,4
+92058,0.0,5
+81929,2.0,6
+143423,0.0,0
+82939,0.0,1
+74403,1.0,2
+68234,0.0,3
+94556,1.0,4
+80311,0.0,5
+75283,3.0,6
+77724,0.0,0
+49229,2.0,1
+65708,2.0,2
+273864,1.0,3
+1711281,0.0,4
+1900253,5.0,5
+343071,1.0,6
+1551326,0.0,0
+56636,1.0,1
+272782,2.0,2
+1785678,0.0,3
+241866,0.0,4
+461904,0.0,5
+2191901,2.0,6
+102925,0.0,0
+242778,1.0,1
+298608,0.0,2
+322458,10.0,3
+216027,9.0,4
+916052,12.0,5
+193278,12.0,6
+263207,8.0,0
+672948,10.0,1
+281909,1.0,2
+384562,1.0,3
+1027375,2.0,4
+828905,9.0,5
+624188,22.0,6
+392218,8.0,0
+292581,10.0,1
+299869,12.0,2
+769455,20.0,3
+316443,8.0,4
+1212864,24.0,5
+1397338,28.0,6
+223249,8.0,0
+191264,14.0,1
diff --git a/machine_learning/forecasting/run.py b/machine_learning/forecasting/run.py
new file mode 100644
index 0000000..467371e
--- /dev/null
+++ b/machine_learning/forecasting/run.py
@@ -0,0 +1,156 @@
+"""
+Forecasting code, adapted here as a safety checker for daily data.
+
+Example use case: an online shop where some days' figures go missing
+(the recorded amounts are lower than expected); a vote of several
+forecasting models flags those days.
+
+Notes:
+1. An ordinary statistical method would also work, but this example
+   data set is small and rather erratic.
+2. The same functions can be reused for plain forecasting (e.g. the
+   next three months of sales) with minor adjustments.
+"""
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import Normalizer
+from sklearn.svm import SVR
+from statsmodels.tsa.statespace.sarimax import SARIMAX
+
+
+def linear_regression_prediction(
+    train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
+) -> float:
+    """
+    First method: linear regression
+    input : training data (date, total_user, total_event) as lists of float
+    output : predicted total users for the first test day, as a float
+    >>> n = linear_regression_prediction([2,3,4,5], [5,3,4,6], [3,1,2,4], [2,1], [2,2])
+    >>> abs(n - 4.0) < 1e-6  # tolerate floating point noise
+    True
+    """
+    x = np.array([[1, item, train_mtch[i]] for i, item in enumerate(train_dt)])
+    y = np.array(train_usr)
+    # ordinary least squares via the normal equations
+    beta = np.dot(np.dot(np.linalg.inv(np.dot(x.transpose(), x)), x.transpose()), y)
+    return abs(beta[0] + test_dt[0] * beta[1] + test_mtch[0] * beta[2])
+
+
+def sarimax_predictor(train_user: list, train_match: list, test_match: list) -> float:
+    """
+    Second method: SARIMAX
+    SARIMAX is a statistical method that learns the pattern of previous
+    values (here with total_event as an exogenous regressor) to predict
+    future data.
+    input : training data (total_user, with exog data = total_event) as
+            lists of float
+    output : predicted total users for the first test day, as a float
+    >>> sarimax_predictor([4,2,6,8], [3,1,2,4], [2])
+    6.6666671111109626
+    """
+    order = (1, 2, 1)
+    seasonal_order = (1, 1, 0, 7)  # weekly seasonality
+    model = SARIMAX(
+        train_user, exog=train_match, order=order, seasonal_order=seasonal_order
+    )
+    model_fit = model.fit(disp=False, maxiter=600, method="nm")
+    result = model_fit.predict(1, len(test_match), exog=[test_match])
+    return result[0]
+
+
+def support_vector_regressor(x_train: list, x_test: list, train_user: list) -> float:
+    """
+    Third method: support vector regressor
+    SVR applies the same principles as the SVM classifier, with only a
+    few minor differences that make it better suited to regression.
+    input : x = list of [total_event, day] pairs, train_user = list of
+            total users in float
+    output : predicted total users for the first test day, as a float
+    >>> support_vector_regressor([[5,2],[1,5],[6,2]], [[3,2]], [2,1,4])
+    1.634932078116079
+    """
+    regressor = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
+    regressor.fit(x_train, train_user)
+    y_pred = regressor.predict(x_test)
+    return y_pred[0]
+
+
+def interquartile_range_checker(train_user: list) -> float:
+    """
+    Optional method: interquartile range
+    This method can be used to check whether a data point is an outlier.
+    input : list of total users in float
+    output : low limit of the input, as a float
+    >>> interquartile_range_checker([1,2,3,4,5,6,7,8,9,10])
+    2.8
+    """
+    train_user.sort()
+    q1 = np.percentile(train_user, 25)
+    q3 = np.percentile(train_user, 75)
+    iqr = q3 - q1
+    # 0.1 keeps the limit deliberately tight; 1.5 is the textbook multiplier
+    low_lim = q1 - (iqr * 0.1)
+    return low_lim
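+
+
+# Illustrative sketch (an assumed use, not wired into the vote below):
+# the IQR low limit could pre-filter suspicious days before the models
+# vote, e.g.:
+#   low_limit = interquartile_range_checker(total_user)
+#   suspect_days = [user for user in total_user if user < low_limit]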
+
+
+def data_safety_checker(list_vote: list, actual_result: float) -> None:
+    """
+    Review all the votes (the list of model predictions) and compare
+    them with the actual result.
+    input : list of predictions and the actual result as a float
+    output : print whether today's data is safe or not
+    >>> data_safety_checker([2,3,4], 5.0)
+    Today's data is not safe.
+    """
+    safe = 0
+    not_safe = 0
+    for i in list_vote:
+        if i > actual_result:
+            # the models expected more than was recorded: data may be missing
+            not_safe += 1
+        elif abs(abs(i) - abs(actual_result)) <= 0.1:
+            safe += 1
+        else:
+            not_safe += 1
+    print(f"Today's data is {'not ' if safe <= not_safe else ''}safe.")
+
+
+# data_input_df = pd.read_csv("ex_data.csv")  # the CSV above ships with a header row
+data_input = [[18231, 0.0, 1], [22621, 1.0, 2], [15675, 0.0, 3], [23583, 1.0, 4]]
+data_input_df = pd.DataFrame(data_input, columns=["total_user", "total_events", "days"])
+
+"""
+data columns = total users in a day, how many online events were held
+that day, and the day of the week (Sunday-Saturday)
+"""
+
+# start normalization
+normalize_df = Normalizer().fit_transform(data_input_df.values)
+# split data
+total_date = normalize_df[:, 2].tolist()
+total_user = normalize_df[:, 0].tolist()
+total_match = normalize_df[:, 1].tolist()
+
+# for svr (input variables = total_event and day)
+x = normalize_df[:, [1, 2]].tolist()
+x_train = x[: len(x) - 1]
+x_test = x[len(x) - 1 :]
+
+# for linear regression & sarimax
+trn_date = total_date[: len(total_date) - 1]
+trn_user = total_user[: len(total_user) - 1]
+trn_match = total_match[: len(total_match) - 1]
+
+tst_date = total_date[len(total_date) - 1 :]
+tst_user = total_user[len(total_user) - 1 :]
+tst_match = total_match[len(total_match) - 1 :]
+
+
+# voting system with forecasting
+res_vote = []
+res_vote.append(
+    linear_regression_prediction(trn_date, trn_user, trn_match, tst_date, tst_match)
+)
+res_vote.append(sarimax_predictor(trn_user, trn_match, tst_match))
+res_vote.append(support_vector_regressor(x_train, x_test, trn_user))

+# check the safety of today's data (the held-out last day)
+data_safety_checker(res_vote, tst_user[0])
diff --git a/requirements.txt b/requirements.txt
index 67d9bbb..8bbb8d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ qiskit
 requests
 scikit-fuzzy
 sklearn
+statsmodels
 sympy
 tensorflow
 xgboost
--
GitLab
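
For reference, a minimal sketch of driving the same voting pipeline with the
full ex_data.csv added above instead of the four-row inline sample in run.py.
The working directory (repository root) and the import path are assumptions
about how the module would be used; note that importing run also executes its
small inline example.

    import pandas as pd
    from sklearn.preprocessing import Normalizer

    from machine_learning.forecasting.run import (
        data_safety_checker,
        linear_regression_prediction,
        sarimax_predictor,
        support_vector_regressor,
    )

    # ex_data.csv ships with a header row: total_user,total_events,days
    df = pd.read_csv("machine_learning/forecasting/ex_data.csv")
    matrix = Normalizer().fit_transform(df.values)

    total_user = matrix[:, 0].tolist()
    total_match = matrix[:, 1].tolist()
    total_date = matrix[:, 2].tolist()
    x = matrix[:, [1, 2]].tolist()

    # hold out the last day and let the three models vote on it
    votes = [
        linear_regression_prediction(
            total_date[:-1], total_user[:-1], total_match[:-1],
            total_date[-1:], total_match[-1:],
        ),
        sarimax_predictor(total_user[:-1], total_match[:-1], total_match[-1:]),
        support_vector_regressor(x[:-1], x[-1:], total_user[:-1]),
    ]
    data_safety_checker(votes, total_user[-1])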