From 12c69800bdb7e3eb25de2ece9837ed79dc58df2b Mon Sep 17 00:00:00 2001
From: Nandiya
Date: Sat, 24 Oct 2020 21:07:27 +0700
Subject: [PATCH] Forecast (#3219)

* add forecasting code

* add statsmodel

* sort import

* sort import fix

* fixing black

* sort requirement

* optimize code

* try with limited data

* sort again

* sort fix

* sort fix

* delete warning and black

* add code for forecasting

* use black

* add more hints to describe

* add doctest

* finding whitespace

* fixing doctest

* delete

* revert back

* revert back

* revert back again

* revert back again

* revert back again

* try trimming whitespace

* try adding doctypeand etc

* fixing reviews

* deleting all the space

* fixing the build

* delete x

* add description for safety checker

* deleting subscription integer

* fix docthint

* make def to use function parameters and return values

* make def to use function parameters and return values

* type hints on data safety checker

* optimize code

* Update run.py

Co-authored-by: FVFYK3GEHV22
Co-authored-by: Christian Clauss
---
 machine_learning/forecasting/__init__.py |   0
 machine_learning/forecasting/ex_data.csv | 114 +++++++++++++++++
 machine_learning/forecasting/run.py      | 156 +++++++++++++++++++++++
 requirements.txt                         |   1 +
 4 files changed, 271 insertions(+)
 create mode 100644 machine_learning/forecasting/__init__.py
 create mode 100644 machine_learning/forecasting/ex_data.csv
 create mode 100644 machine_learning/forecasting/run.py

diff --git a/machine_learning/forecasting/__init__.py b/machine_learning/forecasting/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/machine_learning/forecasting/ex_data.csv b/machine_learning/forecasting/ex_data.csv
new file mode 100644
index 0000000..1c429e6
--- /dev/null
+++ b/machine_learning/forecasting/ex_data.csv
@@ -0,0 +1,114 @@
+total_user,total_events,days
+18231,0.0,1
+22621,1.0,2
+15675,0.0,3
+23583,1.0,4
+68351,5.0,5
+34338,3.0,6
+19238,0.0,0
+24192,0.0,1
+70349,0.0,2
+103510,0.0,3
+128355,1.0,4
+148484,6.0,5
+153489,3.0,6
+162667,1.0,0
+311430,3.0,1
+435663,7.0,2
+273526,0.0,3
+628588,2.0,4
+454989,13.0,5
+539040,3.0,6
+52974,1.0,0
+103451,2.0,1
+810020,5.0,2
+580982,3.0,3
+216515,0.0,4
+134694,10.0,5
+93563,1.0,6
+55432,1.0,0
+169634,1.0,1
+254908,4.0,2
+315285,3.0,3
+191764,0.0,4
+514284,7.0,5
+181214,4.0,6
+78459,2.0,0
+161620,3.0,1
+245610,4.0,2
+326722,5.0,3
+214578,0.0,4
+312365,5.0,5
+232454,4.0,6
+178368,1.0,0
+97152,1.0,1
+222813,4.0,2
+285852,4.0,3
+192149,1.0,4
+142241,1.0,5
+173011,2.0,6
+56488,3.0,0
+89572,2.0,1
+356082,2.0,2
+172799,0.0,3
+142300,1.0,4
+78432,2.0,5
+539023,9.0,6
+62389,1.0,0
+70247,1.0,1
+89229,0.0,2
+94583,1.0,3
+102455,0.0,4
+129270,0.0,5
+311409,1.0,6
+1837026,0.0,0
+361824,0.0,1
+111379,2.0,2
+76337,2.0,3
+96747,0.0,4
+92058,0.0,5
+81929,2.0,6
+143423,0.0,0
+82939,0.0,1
+74403,1.0,2
+68234,0.0,3
+94556,1.0,4
+80311,0.0,5
+75283,3.0,6
+77724,0.0,0
+49229,2.0,1
+65708,2.0,2
+273864,1.0,3
+1711281,0.0,4
+1900253,5.0,5
+343071,1.0,6
+1551326,0.0,0
+56636,1.0,1
+272782,2.0,2
+1785678,0.0,3
+241866,0.0,4
+461904,0.0,5
+2191901,2.0,6
+102925,0.0,0
+242778,1.0,1
+298608,0.0,2
+322458,10.0,3
+216027,9.0,4
+916052,12.0,5
+193278,12.0,6
+263207,8.0,0
+672948,10.0,1
+281909,1.0,2
+384562,1.0,3
+1027375,2.0,4
+828905,9.0,5
+624188,22.0,6
+392218,8.0,0
+292581,10.0,1
+299869,12.0,2
+769455,20.0,3
+316443,8.0,4
+1212864,24.0,5
+1397338,28.0,6
+223249,8.0,0
+191264,14.0,1
diff --git a/machine_learning/forecasting/run.py b/machine_learning/forecasting/run.py
new file mode 100644
index 0000000..467371e
--- /dev/null
+++ b/machine_learning/forecasting/run.py
@@ -0,0 +1,156 @@
+"""
+Forecasting code, adapted here as a safety checker for daily data.
+
+Example use case: an online shop where some days' figures go missing
+(the recorded amounts are lower than expected); a vote of several
+forecasting models flags those days.
+
+Notes:
+1. An ordinary statistical method would also work, but this example
+   data set is small and rather erratic.
+2. The same functions can be reused for plain forecasting (e.g. the
+   next three months of sales) with minor adjustments.
+"""
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import Normalizer
+from sklearn.svm import SVR
+from statsmodels.tsa.statespace.sarimax import SARIMAX
+
+
+def linear_regression_prediction(
+    train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
+) -> float:
+    """
+    First method: linear regression
+    input : training data (date, total_user, total_event) as lists of float
+    output : predicted total users for the first test day, as a float
+    >>> n = linear_regression_prediction([2,3,4,5], [5,3,4,6], [3,1,2,4], [2,1], [2,2])
+    >>> abs(n - 4.0) < 1e-6  # tolerate floating point noise
+    True
+    """
+    x = np.array([[1, item, train_mtch[i]] for i, item in enumerate(train_dt)])
+    y = np.array(train_usr)
+    # ordinary least squares via the normal equations
+    beta = np.dot(np.dot(np.linalg.inv(np.dot(x.transpose(), x)), x.transpose()), y)
+    return abs(beta[0] + test_dt[0] * beta[1] + test_mtch[0] * beta[2])
+
+
+def sarimax_predictor(train_user: list, train_match: list, test_match: list) -> float:
+    """
+    Second method: SARIMAX
+    SARIMAX is a statistical method that learns the pattern of previous
+    values (here with total_event as an exogenous regressor) to predict
+    future data.
+    input : training data (total_user, with exog data = total_event) as
+            lists of float
+    output : predicted total users for the first test day, as a float
+    >>> sarimax_predictor([4,2,6,8], [3,1,2,4], [2])
+    6.6666671111109626
+    """
+    order = (1, 2, 1)
+    seasonal_order = (1, 1, 0, 7)  # weekly seasonality
+    model = SARIMAX(
+        train_user, exog=train_match, order=order, seasonal_order=seasonal_order
+    )
+    model_fit = model.fit(disp=False, maxiter=600, method="nm")
+    result = model_fit.predict(1, len(test_match), exog=[test_match])
+    return result[0]
+
+
+def support_vector_regressor(x_train: list, x_test: list, train_user: list) -> float:
+    """
+    Third method: support vector regressor
+    SVR applies the same principles as the SVM classifier, with only a
+    few minor differences that make it better suited to regression.
+    input : x = list of [total_event, day] pairs, train_user = list of
+            total users in float
+    output : predicted total users for the first test day, as a float
+    >>> support_vector_regressor([[5,2],[1,5],[6,2]], [[3,2]], [2,1,4])
+    1.634932078116079
+    """
+    regressor = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
+    regressor.fit(x_train, train_user)
+    y_pred = regressor.predict(x_test)
+    return y_pred[0]
+
+
+def interquartile_range_checker(train_user: list) -> float:
+    """
+    Optional method: interquartile range
+    This method can be used to check whether a data point is an outlier.
+    input : list of total users in float
+    output : low limit of the input, as a float
+    >>> interquartile_range_checker([1,2,3,4,5,6,7,8,9,10])
+    2.8
+    """
+    train_user.sort()
+    q1 = np.percentile(train_user, 25)
+    q3 = np.percentile(train_user, 75)
+    iqr = q3 - q1
+    # 0.1 keeps the limit deliberately tight; 1.5 is the textbook multiplier
+    low_lim = q1 - (iqr * 0.1)
+    return low_lim
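+
+
+# Illustrative sketch (an assumed use, not wired into the vote below):
+# the IQR low limit could pre-filter suspicious days before the models
+# vote, e.g.:
+#   low_limit = interquartile_range_checker(total_user)
+#   suspect_days = [user for user in total_user if user < low_limit]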
+
+
+def data_safety_checker(list_vote: list, actual_result: float) -> None:
+    """
+    Review all the votes (the list of model predictions) and compare
+    them with the actual result.
+    input : list of predictions and the actual result as a float
+    output : print whether today's data is safe or not
+    >>> data_safety_checker([2,3,4], 5.0)
+    Today's data is not safe.
+    """
+    safe = 0
+    not_safe = 0
+    for i in list_vote:
+        if i > actual_result:
+            # the models expected more than was recorded: data may be missing
+            not_safe += 1
+        elif abs(abs(i) - abs(actual_result)) <= 0.1:
+            safe += 1
+        else:
+            not_safe += 1
+    print(f"Today's data is {'not ' if safe <= not_safe else ''}safe.")
+
+
+# data_input_df = pd.read_csv("ex_data.csv")  # the CSV above ships with a header row
+data_input = [[18231, 0.0, 1], [22621, 1.0, 2], [15675, 0.0, 3], [23583, 1.0, 4]]
+data_input_df = pd.DataFrame(data_input, columns=["total_user", "total_events", "days"])
+
+"""
+data columns = total users in a day, how many online events were held
+that day, and the day of the week (Sunday-Saturday)
+"""
+
+# start normalization
+normalize_df = Normalizer().fit_transform(data_input_df.values)
+# split data
+total_date = normalize_df[:, 2].tolist()
+total_user = normalize_df[:, 0].tolist()
+total_match = normalize_df[:, 1].tolist()
+
+# for svr (input variables = total_event and day)
+x = normalize_df[:, [1, 2]].tolist()
+x_train = x[: len(x) - 1]
+x_test = x[len(x) - 1 :]
+
+# for linear regression & sarimax
+trn_date = total_date[: len(total_date) - 1]
+trn_user = total_user[: len(total_user) - 1]
+trn_match = total_match[: len(total_match) - 1]
+
+tst_date = total_date[len(total_date) - 1 :]
+tst_user = total_user[len(total_user) - 1 :]
+tst_match = total_match[len(total_match) - 1 :]
+
+
+# voting system with forecasting
+res_vote = []
+res_vote.append(
+    linear_regression_prediction(trn_date, trn_user, trn_match, tst_date, tst_match)
+)
+res_vote.append(sarimax_predictor(trn_user, trn_match, tst_match))
+res_vote.append(support_vector_regressor(x_train, x_test, trn_user))

+# check the safety of today's data (the held-out last day)
+data_safety_checker(res_vote, tst_user[0])
diff --git a/requirements.txt b/requirements.txt
index 67d9bbb..8bbb8d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ qiskit
 requests
 scikit-fuzzy
 sklearn
+statsmodels
 sympy
 tensorflow
 xgboost
--
GitLab
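
For reference, a minimal sketch of driving the same voting pipeline with the
full ex_data.csv added above instead of the four-row inline sample in run.py.
The working directory (repository root) and the import path are assumptions
about how the module would be used; note that importing run also executes its
small inline example.

    import pandas as pd
    from sklearn.preprocessing import Normalizer

    from machine_learning.forecasting.run import (
        data_safety_checker,
        linear_regression_prediction,
        sarimax_predictor,
        support_vector_regressor,
    )

    # ex_data.csv ships with a header row: total_user,total_events,days
    df = pd.read_csv("machine_learning/forecasting/ex_data.csv")
    matrix = Normalizer().fit_transform(df.values)

    total_user = matrix[:, 0].tolist()
    total_match = matrix[:, 1].tolist()
    total_date = matrix[:, 2].tolist()
    x = matrix[:, [1, 2]].tolist()

    # hold out the last day and let the three models vote on it
    votes = [
        linear_regression_prediction(
            total_date[:-1], total_user[:-1], total_match[:-1],
            total_date[-1:], total_match[-1:],
        ),
        sarimax_predictor(total_user[:-1], total_match[:-1], total_match[-1:]),
        support_vector_regressor(x[:-1], x[-1:], total_user[:-1]),
    ]
    data_safety_checker(votes, total_user[-1])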