Commit 783cdd1f authored by: H hypox64

0.11953

Parent 24336635
import dataloader
import transformer
import numpy as np
correlates = []
desc_map,price_map = dataloader.load_train()
price_map['price'] = transformer.normlize(price_map['price'])
key = ''
desc_map[key] = transformer.normlize(desc_map[key])
print(np.correlate(desc_map[key],price_map['price']))
# for key in desc_map.keys():
# desc_map[key] = transformer.normlize(desc_map[key])
# correlates.append(np.correlate(desc_map[key],price_map['price'])[0][1])
# print(correlates)
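Note: for two equal-length 1-D arrays, np.correlate returns a single raw cross-correlation value (a dot product), not the Pearson coefficient, and indexing its result with [0][1] as in the commented loop would fail. A minimal sketch (not part of the commit) of the per-feature correlation scan that loop appears to aim for, using np.corrcoef with the same dataloader/transformer helpers:

import numpy as np
import dataloader
import transformer

desc_map, price_map = dataloader.load_train()
price = transformer.normlize(np.array(price_map['price']))
correlations = {}
for key in desc_map.keys():
    feature = transformer.normlize(np.array(desc_map[key]))
    # np.corrcoef returns a 2x2 matrix; entry [0][1] is the Pearson coefficient
    # (constant features yield NaN here)
    correlations[key] = np.corrcoef(feature, price)[0][1]
for key, r in sorted(correlations.items(), key=lambda kv: -abs(kv[1])):
    print(key, round(r, 4))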
......@@ -3,11 +3,8 @@ import csv
import numpy as np
import random
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
import evaluation
from description_map import value_map,fix_key,fix_miss,add_future
from description_map import value_map,fix_key,fix_miss,add_future,fix_LotFrontage
# load description_txt
description_txt = []
......@@ -52,13 +49,6 @@ for i in range(len(colon_indexs)-1):
ori_map[key] = interspace-j-1 #change word to vector
Full_map[desc_key]=ori_map
# def normlize(npdata,justprice = False):
# _mean = np.mean(npdata)
# _std = np.std(npdata)
# if justprice:
# _mean = 180921.195
# _std = 79415.2918
# return (npdata-_mean)/_std
def normlize(npdata,justprice = False):
_min = np.min(npdata)
......@@ -68,8 +58,6 @@ def normlize(npdata,justprice = False):
_max = 755000.0
return (npdata-_min)/(_max-_min)
# def convert2price(tensor):
# return tensor*79415.2918+180921.195
def convert2price(tensor):
return tensor*(755000.0-34900.0)+34900
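A quick sanity check (not part of the commit) that the new convert2price inverts the min-max scaling applied when justprice=True, using the hard-coded SalePrice range 34900-755000:

import numpy as np
import dataloader

prices = np.array([34900.0, 180921.0, 755000.0])
scaled = dataloader.normlize(prices, justprice=True)   # maps SalePrice into [0, 1]
restored = dataloader.convert2price(scaled)            # inverse of the scaling above
assert np.allclose(restored, prices)                   # round trip recovers the inputs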
......@@ -163,28 +151,32 @@ def dict2numpy(dict_data):
return np_data
def load_all(dimension):
desc_map,price_map = load_train()
desc_map = add_future(desc_map)
# print(len(desc_map))
# print(desc_map)
# print(desc_map)
train_price = np.array(price_map['price'])
train_desc = dict2numpy(desc_map)
desc_map = load_test()
train_desc_map,train_price_map = load_train()
test_desc_map = load_test()
desc_map = {}
train_length = len(list(train_desc_map.values())[0])
for key in train_desc_map.keys():
desc_map[key] = np.concatenate((train_desc_map[key],test_desc_map[key]),axis=0)
# desc_map[key] = normlize(desc_map[key])
desc_map['LotFrontage'] = fix_LotFrontage(desc_map)
desc_map['YearBuilt'] = (desc_map['YearBuilt']-1800)/10
desc_map['YearRemodAdd'] = (desc_map['YearRemodAdd']-1800)/10
desc_map = add_future(desc_map)
test_desc = dict2numpy(desc_map)
desc_all = np.concatenate((train_desc,test_desc),axis=0)
for i in range(len(desc_all[0])):
desc_all[:,i] = normlize(desc_all[:,i])
# print(desc_all)
for key in desc_map.keys():
desc_map[key] = normlize(desc_map[key])
desc_all = dict2numpy(desc_map)
pca=PCA(n_components=dimension) # set up PCA with the requested number of principal components
desc_all=pca.fit_transform(desc_all) # project the samples onto the reduced dimensions
train_price = normlize(train_price,True)
train_desc = desc_all[:len(train_desc)]
test_desc = desc_all[len(train_desc):]
train_price = normlize(np.array(train_price_map['price']),True)
train_desc = desc_all[:train_length]
test_desc = desc_all[train_length:]
return train_desc.astype(np.float32),train_price.astype(np.float32),test_desc.astype(np.float32)
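The reworked load_all now normalizes every column, fits PCA once on the stacked train+test matrix, and splits the projection back apart by the training-set row count. A standalone illustration of that split pattern (the shapes are placeholders; the Kaggle data has 1460 training and 1459 test rows):

import numpy as np
from sklearn.decomposition import PCA

train_X = np.random.rand(1460, 300)                 # placeholder feature matrices
test_X = np.random.rand(1459, 300)
both = np.concatenate((train_X, test_X), axis=0)    # stack train and test rows
reduced = PCA(n_components=80).fit_transform(both)  # one shared projection for both sets
train_reduced = reduced[:len(train_X)]              # first train_length rows
test_reduced = reduced[len(train_X):]               # remaining rows are the test set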
......@@ -197,30 +189,11 @@ def write_csv(prices,path):
csvFile.close()
def main():
load_all(80)
dimension = 80
train_desc,train_price,test_desc = load_all(dimension)
# # KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# kr = GridSearchCV(KernelRidge(kernel='polynomial', gamma=0.1),
# param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
# "gamma": np.logspace(-2, 2, 5)})
# kr.fit(train_desc, train_price)
# y_kr = kr.predict(test_desc)
# for i in range(len(y_kr)):
# y_kr[i] = convert2price(y_kr[i])
# # print(y_kr.shape)
# print(dimension,evaluation.eval_test(y_kr))
# dimension = 80
# train_desc,train_price,test_desc = load_all(dimension)
# write_csv(train_price, './result.csv')
# # print(data)
# plt.plot(data[1])
# plt.show()
if __name__ == '__main__':
main()
import numpy as np
import pandas as pd
value_map = {}
value_map["MSSubClass"] = {'180':1,
'30':2, '45':2,
......@@ -110,10 +112,18 @@ def fix_miss(name):
else:
return 0
# def fix_LotFrontage(Full_map):
# a = np.zeros(25)
# for i in range(25):
# a[Full_map['Neighborhood'][i]-1] +=
def fix_LotFrontage(Full_map):
data_df = pd.DataFrame(Full_map)
data_df["LotFrontage"] = data_df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
return data_df["LotFrontage"].to_numpy()
def binary(npdata):
for i in range(len(npdata)):
if npdata[i]>0:
npdata[i] = 1
else:
npdata[i] = 0
return npdata
def add_future(features):
features["TotalHouse"] = features["TotalBsmtSF"] + features["1stFlrSF"] + features["2ndFlrSF"]
......@@ -141,7 +151,9 @@ def add_future(features):
features["Rooms"] = features["FullBath"]+features["TotRmsAbvGrd"]
features["PorchArea"] = features["OpenPorchSF"]+features["EnclosedPorch"]+features["3SsnPorch"]+features["ScreenPorch"]
features["TotalPlace"] = features["TotalBsmtSF"] + features["1stFlrSF"] + features["2ndFlrSF"] + features["GarageArea"] + features["OpenPorchSF"]+features["EnclosedPorch"]+features["3SsnPorch"]+features["ScreenPorch"]
features['all_quality'] = (features['ExterQual'] +features['BsmtFinType1']+features['BsmtFinType2']+
features['KitchenQual']+features['FireplaceQu']+features['GarageQual']+
features['PoolQC']+features['Fence'])
features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
......@@ -153,11 +165,15 @@ def add_future(features):
features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
features['EnclosedPorch'] + features['ScreenPorch'] +
features['WoodDeckSF'])
# features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
# features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
# features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
# features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
# features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
#random features
random_list = ['GrLivArea','OverallQual','2ndFlrSF','YearBuilt','1stFlrSF','TotalBsmtSF','OverallCond',
'my_Neighborhood','my_SaleCondition','BsmtFinSF1','my_MSZoning','LotArea','GarageCars','YearRemodAdd','GarageArea']
length = len(random_list)
for i in range(length):
for j in range(i,length):
if i != j:
features[random_list[i]+'*'+random_list[j]]=features[random_list[i]]*features[random_list[j]]
return features
\ No newline at end of file
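For reference, the nested i/j loop above adds one product column per unordered pair of the 15 listed features, i.e. C(15,2) = 105 interaction features. An equivalent, slightly more compact formulation (a sketch only, assuming features is a dict of equal-length numpy arrays):

from itertools import combinations

def add_pairwise_products(features, names):
    # one product column per unordered pair, same '*'-joined naming as the loop above
    for a, b in combinations(names, 2):
        features[a + '*' + b] = features[a] * features[b]
    return features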
......@@ -32,7 +32,7 @@ def RMSE(records_real,records_predict):
def main():
# my_price = load_submission('./datasets/sample_submission.csv')
my_price = load_submission('./result/0.03688_0.14435.csv')
my_price = load_submission('./result/keras_untuned.csv')
print(eval_test(my_price))
if __name__ == '__main__':
......
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC,LassoCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
......@@ -7,29 +7,104 @@ from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
# import xgboost as xgb
# import lightgbm as lgb
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
import numpy as np
import torch
import dataloader
import evaluation
import time
import transformer
import time
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
def eval(model,train_x,train_y,test_x):
model.fit(train_x, train_y)
y_pre = model.predict(test_x)
for i in range(len(y_pre)):
y_pre[i] = dataloader.convert2price(y_pre[i])
return evaluation.eval_test(y_pre),y_pre
# print(dimension,evaluation.eval_test(y_pre))
# KernelRidge()
krr = GridSearchCV(KernelRidge(kernel='polynomial'),cv = 3,
param_grid={"alpha": np.logspace(-1, 2, 10),
"gamma": np.logspace(-1, 2, 10)})
dimension = 85
train_desc,train_price,test_desc = dataloader.load_all(dimension)
las = LassoCV(alphas=np.logspace(-5, 2, 50),eps=np.logspace(-5, 2, 20),max_iter=10000)
kr = GridSearchCV(KernelRidge(kernel='polynomial'),
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
learning_rate=0.05, max_depth=3,
min_child_weight=1.7817, n_estimators=2200,
reg_alpha=0.4640, reg_lambda=0.8571,
subsample=0.5213, silent=1,
random_state =7, nthread = -1)
# ElasticNet
ENet = GridSearchCV(ElasticNet(max_iter = 10000),
param_grid={"alpha": np.logspace(-3, 2, 6),
"gamma": np.logspace(-2, 2, 5)})
# print(np.logspace(-2, 2, 5))
kr.fit(train_desc, train_price)
y_kr = kr.predict(test_desc)
for i in range(len(y_kr)):
y_kr[i] = dataloader.convert2price(y_kr[i])
# print(y_kr.shape)
print(dimension,evaluation.eval_test(y_kr))
dataloader.write_csv(y_kr, './result/result.csv')
\ No newline at end of file
"l1_ratio": np.logspace(-2, 2, 5)})
#BayesianRidge
bay = BayesianRidge()
#GradientBoostingRegressor
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.01,
max_depth=4, max_features='sqrt',
min_samples_leaf=15, min_samples_split=10,
loss='huber', random_state =5)
#LGBMRegressor
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
learning_rate=0.05, n_estimators=720,
max_bin = 55, bagging_fraction = 0.8,
bagging_freq = 5, feature_fraction = 0.2319,
feature_fraction_seed=9, bagging_seed=9,
min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
#SVR
model_svr = GridSearchCV(SVR(kernel="rbf"),
param_grid={"C": np.logspace(0, 2, 5),
"gamma": np.logspace(-4, -3, 8),
"epsilon":np.logspace(-4, -3, 5)})
models = [krr,las,model_xgb,ENet,bay,GBoost,model_lgb,model_svr]
model_names = ['krr','las','model_xgb','ENet','bay','GBoost','model_lgb','model_svr']
for model,model_name in zip(models,model_names):
print(model_name)
losss = []
start_dimension = 60
end_dimension = 150
for dimension in range(start_dimension,end_dimension):
t1 = time.time()
train_desc,train_price,test_desc = dataloader.load_all(dimension)
loss,_ = eval(model,train_desc,train_price,test_desc)
losss.append(loss)
t2 = time.time()
print(dimension,loss,' cost time:','%.3f'%(t2-t1),'s')
t1 = time.time()
best_dimension = losss.index(min(losss))+start_dimension
print('Best:',min(losss),' dimension:',best_dimension)
train_desc,train_price,test_desc = dataloader.load_all(best_dimension)
loss,pre = eval(model,train_desc,train_price,test_desc)
dataloader.write_csv(pre, './result/best_'+'%.6f'%loss+'_'+model_name+'.csv')
plt.plot(np.linspace(start_dimension,dimension,dimension-start_dimension+1),losss)
plt.xlabel('PCA dimension')
plt.ylabel('loss')
plt.title(model_name+' :loss_PCA')
plt.savefig('./images/'+'%.6f'%loss+'_'+str(best_dimension)+'_'+model_name+".png")
plt.cla()
# plt.show()
......@@ -10,7 +10,7 @@ class Linear(nn.Module):
def __init__(self, n_feature, n_hidden, n_output):
super(Linear, self).__init__()
self.fc1 = torch.nn.Linear(n_feature, n_hidden) # hidden layer
self.relu = nn.ReLU(inplace=True)
self.relu = nn.Sigmoid()
self.dropout = nn.Dropout(0.2)
self.fc2 = torch.nn.Linear(n_hidden, n_output) # output layer
......
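The only change to the network itself is the hidden-layer activation, swapped from ReLU to Sigmoid while keeping the old attribute name self.relu. A hypothetical restatement with a neutral attribute name; the forward order fc1 -> activation -> dropout -> fc2 is assumed, since forward() lies outside the hunk shown here:

import torch.nn as nn

class LinearSketch(nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(LinearSketch, self).__init__()
        self.fc1 = nn.Linear(n_feature, n_hidden)   # hidden layer
        self.act = nn.Sigmoid()                     # was nn.ReLU(inplace=True) before this commit
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(n_hidden, n_output)    # output layer

    def forward(self, x):
        return self.fc2(self.dropout(self.act(self.fc1(x))))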
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -8,13 +8,13 @@ import time
import transformer
#parameter
LR = 0.0001
LR = 0.001
EPOCHS = 1000
BATCHSIZE = 64
CONTINUE = False
use_gpu = True
SAVE_FRE = 5
Dimension = 120
Dimension = 128
#load data
train_desc,train_price,test_desc = dataloader.load_all(Dimension)
train_desc.tolist()
......
......@@ -8,4 +8,17 @@ def match_random(a,b):
np.random.shuffle(b)
def random_transform(a,alpha):
return a*random.uniform(1-alpha,1+alpha)
\ No newline at end of file
return a*random.uniform(1-alpha,1+alpha)
def normlize(npdata,justprice = False):
_min = np.min(npdata)
_max = np.max(npdata)
if justprice:
_min = 34900.0
_max = 755000.0
return (npdata-_min)/(_max-_min)
def convert2price(tensor):
return tensor*(755000.0-34900.0)+34900
\ No newline at end of file