Commit 24336635 authored by hypox64

0.12229 by KernelRidge

Parent 4049f4a5
......@@ -2,7 +2,12 @@ import os
import csv
import numpy as np
import random
# import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
import evaluation
from description_map import value_map,fix_key,fix_miss,add_future
# load description_txt
description_txt = []
......@@ -14,38 +19,46 @@ for i,line in enumerate(open('./datasets/data_description.txt'),0):
description_txt.append(line)
colon_indexs.append(524)#the end of description
description_length = len(colon_indexs)-1
print('Description length:',description_length)
descriptions = []
Full_map = {}
desc_keys = []
for i in range(len(colon_indexs)-1):
mapping = {}
description_title = description_txt[colon_indexs[i]]
ori_map = {}
my_map = {}
desc_key = description_txt[colon_indexs[i]]
desc_key = desc_key[:desc_key.find(':')]
desc_keys.append(desc_key)
# print(desc_key)
interspace = colon_indexs[i+1]-colon_indexs[i]-2 #subtract the two blank separator lines
if interspace == 0:
mapping['Just_num'] = 'None'
descriptions.append(mapping)
ori_map['Just_num'] = 'None'
Full_map[desc_key]=ori_map
else:
for j in range(interspace-1): #skip the trailing blank line
line = description_txt[colon_indexs[i]+j+2]
mapping_key = line[:line.find('\t')]
key = line[:line.find('\t')]
#data_description.txt uses inconsistent spellings here
if mapping_key == 'NA ':
mapping_key = 'NA'
if mapping_key == 'WD ':
mapping_key = 'WD'
if mapping_key == 'BrkComm' or mapping_key =='Brk Cmn':
mapping_key = 'BrkCmn'
mapping[mapping_key] = j
descriptions.append(mapping)
# print(descriptions)
def match_random(a,b):
state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(state)
np.random.shuffle(b)
if key == 'NA ':
key = 'NA'
if key == 'WD ':
key = 'WD'
if key == 'BrkComm' or key =='Brk Cmn':
key = 'BrkCmn'
if desc_key in value_map:
my_map[key] = value_map[desc_key][key]
Full_map['my_'+desc_key]=my_map
ori_map[key] = interspace-j-1 #encode the categorical level as an integer
Full_map[desc_key]=ori_map
# def normlize(npdata,justprice = False):
# _mean = np.mean(npdata)
# _std = np.std(npdata)
# if justprice:
# _mean = 180921.195
# _std = 79415.2918
# return (npdata-_mean)/_std
def normlize(npdata,justprice = False):
_min = np.min(npdata)
......@@ -55,92 +68,124 @@ def normlize(npdata,justprice = False):
_max = 755000.0
return (npdata-_min)/(_max-_min)
# def convert2price(tensor):
# return tensor*79415.2918+180921.195
def convert2price(tensor):
return tensor*(755000.0-34900.0)+34900
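# normlize/convert2price min-max scale prices with the hard-coded range
# [34900, 755000] (presumably the min/max SalePrice in train.csv), so models
# work on [0, 1] values and predictions can be mapped back to dollars.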
def fix_key(key):
#the csv uses inconsistent spellings here
if key == 'Wd Shng':
key='WdShing'
if key == '2fmCon':
key='2FmCon'
if key == 'NAmes':
key='Names'
if key == 'Duplex':
key='Duplx'
if key == 'CmentBd':
key='CemntBd'
if key == 'C (all)':
key='C'
if key == 'Twnhs':
key='TwnhsI'
if key == 'Brk Cmn' or key =='BrkComm':
key='BrkCmn'
else:
key = key
return key
def load_train():
##load train csv
desc_map = {}
price_map = {}
csv_data = []
#train_del_1299_524.csv
reader = csv.reader(open('./datasets/train.csv'))
for line in reader:
csv_data.append(line)
id_length = len(csv_data)-1
data = np.zeros((id_length,description_length+1))
for i in range(id_length):
for j in range(description_length+1):
key = csv_data[i+1][j+1]
for i in range(80):
arr = np.zeros(id_length)
my_arr = np.zeros(id_length)
for j in range(id_length):
key = csv_data[j+1][i+1]
key = fix_key(key)
if j == description_length:
data[i][j] = float(key)
if i == 79:
arr[j] = float(key)
else:
if key in descriptions[j]: #SalePrice
data[i][j] = float(descriptions[j][key])
else:#just num here
# print(i,j)
#hand-crafted map from value_map
if desc_keys[i] in value_map:
if key == 'NA':
my_arr[j] = fix_miss(desc_keys[i])
else:
my_arr[j] = Full_map['my_'+desc_keys[i]][key]
#automatic map built from data_description.txt
if key in Full_map[desc_keys[i]]:
arr[j] = Full_map[desc_keys[i]][key]
else:
if key == 'NA':
key = 0;
data[i][j] = float(key)
return data
arr[j] = fix_miss(desc_keys[i])
else:
arr[j] = float(key)
if i == 79:
price_map['price']=arr
else:
if desc_keys[i] in value_map:
desc_map['my_'+desc_keys[i]] = my_arr
# else:
desc_map[desc_keys[i]] = arr
return desc_map,price_map
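# load_train() returns two dicts of per-column numpy arrays: desc_map holds one
# array per feature (plus a 'my_'-prefixed copy for columns that have a
# hand-crafted ordinal encoding in value_map), and price_map holds SalePrice.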
def load_test():
##load test csv
desc_map = {}
csv_data = []
reader = csv.reader(open('./datasets/test.csv'))
for line in reader:
csv_data.append(line)
id_length = len(csv_data)-1
data = np.zeros((id_length,description_length))
for i in range(id_length):
for j in range(description_length):
key = csv_data[i+1][j+1]
for i in range(79):
arr = np.zeros(id_length)
my_arr = np.zeros(id_length)
for j in range(id_length):
key = csv_data[j+1][i+1]
key = fix_key(key)
if j == description_length:
data[i][j] = float(key)
#hand-crafted map from value_map
if desc_keys[i] in value_map:
if key == 'NA':
my_arr[j] = fix_miss(desc_keys[i])
else:
my_arr[j] = Full_map['my_'+desc_keys[i]][key]
#automatic map built from data_description.txt
if key in Full_map[desc_keys[i]]:
arr[j] = Full_map[desc_keys[i]][key]
else:
if key in descriptions[j]: #SalePrice
data[i][j] = float(descriptions[j][key])
else:#just num here
# print(i,j)
if key == 'NA':
key = 0;
data[i][j] = float(key)
return data
def load_all():
train_desc = load_train()[:,:79]
train_price = load_train()[:,79]
test_desc = load_test()
if key == 'NA':
arr[j] = fix_miss(desc_keys[i])
else:
arr[j] = float(key)
if desc_keys[i] in value_map:
desc_map['my_'+desc_keys[i]] = my_arr
# else:
desc_map[desc_keys[i]] = arr
return desc_map
# for i,word in enumerate(wordlist,0):
def dict2numpy(dict_data):
value_0 = list(dict_data.values())[0]
np_data = np.zeros((len(value_0),len(dict_data)))
for i,key in enumerate(dict_data.keys(),0):
np_data[:,i] = np.array(dict_data[key])
return np_data
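# dict2numpy stacks the per-feature arrays of a dict into an
# (n_samples, n_features) matrix, one column per key.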
def load_all(dimension):
desc_map,price_map = load_train()
desc_map = add_future(desc_map)
# print(len(desc_map))
# print(desc_map)
# print(desc_map)
train_price = np.array(price_map['price'])
train_desc = dict2numpy(desc_map)
desc_map = load_test()
desc_map = add_future(desc_map)
test_desc = dict2numpy(desc_map)
desc_all = np.concatenate((train_desc,test_desc),axis=0)
for i in range(description_length):
for i in range(len(desc_all[0])):
desc_all[:,i] = normlize(desc_all[:,i])
train_price = normlize(train_price)
# print(desc_all)
pca=PCA(n_components=dimension) #set up PCA with the requested number of principal components
desc_all=pca.fit_transform(desc_all) #project the samples onto the reduced space
train_price = normlize(train_price,True)
train_desc = desc_all[:len(train_desc)]
test_desc = desc_all[len(train_desc):]
return train_desc.astype(np.float32),train_price.astype(np.float32),test_desc.astype(np.float32)
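# load_all pipeline: read train/test, add engineered features, concatenate both
# sets, min-max normalize every column, reduce to `dimension` components with
# PCA, then split back into float32 train/test matrices.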
def write_csv(prices,path):
......@@ -152,9 +197,25 @@ def write_csv(prices,path):
csvFile.close()
def main():
train_desc,train_price,test_desc = load_all()
print(len(test_desc))
dimension = 80
train_desc,train_price,test_desc = load_all(dimension)
# # KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# kr = GridSearchCV(KernelRidge(kernel='polynomial', gamma=0.1),
# param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
# "gamma": np.logspace(-2, 2, 5)})
# kr.fit(train_desc, train_price)
# y_kr = kr.predict(test_desc)
# for i in range(len(y_kr)):
# y_kr[i] = convert2price(y_kr[i])
# # print(y_kr.shape)
# print(dimension,evaluation.eval_test(y_kr))
# write_csv(train_price, './result.csv')
# # print(data)
......
import numpy as np
value_map = {}
value_map["MSSubClass"] = {'180':1,
'30':2, '45':2,
'190':3, '50':3, '90':3,
'85':4, '40':4, '160':4,
'70':5, '20':5, '75':5, '80':5, '150':5,
'120': 6, '60':6}
value_map["MSZoning"] = {'A':1,'C':4, 'FV':1, 'I':3,'RH':3, 'RL':2, 'RP':3, 'RM':2}
value_map["Neighborhood"] = {'MeadowV':1,
'IDOTRR':2, 'BrDale':2,
'OldTown':3, 'Edwards':3, 'BrkSide':3,
'Sawyer':4, 'Blueste':4, 'SWISU':4, 'Names':4,
'NPkVill':5, 'Mitchel':5,
'SawyerW':6, 'Gilbert':6, 'NWAmes':6,
'Blmngtn':7, 'CollgCr':7, 'ClearCr':7, 'Crawfor':7,
'Veenker':8, 'Somerst':8, 'Timber':8,
'StoneBr':9,
'NoRidge':10, 'NridgHt':10}
value_map["Condition1"] = {'Artery':1,
'Feedr':2, 'RRAe':2,
'Norm':3, 'RRAn':3,
'PosN':4, 'RRNe':4,
'PosA':5 ,'RRNn':5}
value_map["BldgType"] = {'2FmCon':1, 'Duplx':1, 'TwnhsI':1, '1Fam':2, 'TwnhsE':2}
value_map["HouseStyle"] = {'1.5Unf':1,
'1.5Fin':2, '2.5Unf':2, 'SFoyer':2,
'1Story':3, 'SLvl':3,
'2Story':4, '2.5Fin':4}
value_map["Exterior1st"] = {'BrkCmn':1,
'AsphShn':2, 'CBlock':2, 'AsbShng':2,
'WdShing':3, 'Wd Sdng':3, 'MetalSd':3, 'Stucco':3, 'HdBoard':3,'Other':3,
'BrkFace':4, 'Plywood':4, 'PreCast':4,
'VinylSd':5,
'CemntBd':6,
'Stone':7, 'ImStucc':7}
value_map["MasVnrType"] = {'BrkCmn':1, 'None':1, 'CBlock':1,'BrkFace':2, 'Stone':3}
value_map["ExterQual"] = {'Po':1,'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
value_map["Foundation"] = {'Slab':1,
'BrkTil':2, 'CBlock':2, 'Stone':2,
'Wood':3, 'PConc':4}
value_map["BsmtQual"] = {'NA':1, 'Po':2,'Fa':3, 'TA':4, 'Gd':5, 'Ex':6}
value_map["BsmtExposure"] = {'NA':1, 'No':2, 'Av':3, 'Mn':3, 'Gd':4}
value_map["Heating"] = {'Floor':1, 'Grav':1, 'Wall':2, 'OthW':3, 'GasW':4, 'GasA':5}
value_map["HeatingQC"] = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
value_map["KitchenQual"] = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
value_map["Functional"] = {'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':3, 'Mod':4, 'Min2':5, 'Min1':5, 'Typ':6}
value_map["FireplaceQu"] = {'NA':1, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
value_map["GarageType"] = {'CarPort':1, 'NA':1,
'Detchd':2,
'2Types':3, 'Basment':3,
'Attchd':4, 'BuiltIn':5}
value_map["GarageFinish"] = {'NA':1, 'Unf':2, 'RFn':3, 'Fin':4}
value_map["PavedDrive"] = {'N':1, 'P':2, 'Y':3}
value_map["SaleType"] = {'COD':1, 'ConLD':1, 'ConLI':1, 'ConLw':1, 'Oth':1, 'WD':1,
'CWD':2, 'VWD':2, 'Con':3, 'New':3}
value_map["SaleCondition"] = {'AdjLand':1, 'Abnorml':2, 'Alloca':2, 'Family':2, 'Normal':3, 'Partial':4}
def fix_key(key):
#the csv uses inconsistent spellings here
if key == 'Wd Shng':
key='WdShing'
if key == '2fmCon':
key='2FmCon'
if key == 'NAmes':
key='Names'
if key == 'Duplex':
key='Duplx'
if key == 'CmentBd':
key='CemntBd'
if key == 'C (all)':
key='C'
if key == 'Twnhs':
key='TwnhsI'
if key == 'Brk Cmn' or key =='BrkComm':
key='BrkCmn'
else:
key = key
return key
miss_0 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
miss_1=["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
miss_2 = ['LotFrontage']
def fix_miss(name):
if name in miss_0:
return 1
else:
return 0
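# fix_miss: for the categorical columns listed in miss_0 an 'NA' cell means the
# feature is simply absent, so it is filled with the lowest ordinal rank (1);
# missing values in any other column fall back to 0.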
# def fix_LotFrontage(Full_map):
# a = np.zeros(25)
# for i in range(25):
# a[Full_map['Neighborhood'][i]-1] +=
def add_future(features):
features["TotalHouse"] = features["TotalBsmtSF"] + features["1stFlrSF"] + features["2ndFlrSF"]
features["TotalArea"] = features["TotalBsmtSF"] + features["1stFlrSF"] + features["2ndFlrSF"] + features["GarageArea"]
features["TotalHouse_OverallQual"] = features["TotalHouse"] * features["OverallQual"]
features["GrLivArea_OverallQual"] = features["GrLivArea"] * features["OverallQual"]
features["my_MSZoning_TotalHouse"] = features["my_MSZoning"] * features["TotalHouse"]
features["my_MSZoning_OverallQual"] = features["my_MSZoning"] + features["OverallQual"]
features["my_MSZoning_YearBuilt"] = features["my_MSZoning"] + features["YearBuilt"]
features["my_Neighborhood_TotalHouse"] = features["my_Neighborhood"] * features["TotalHouse"]
features["my_Neighborhood_OverallQual"] = features["my_Neighborhood"] + features["OverallQual"]
features["my_Neighborhood_YearBuilt"] = features["my_Neighborhood"] + features["YearBuilt"]
features["BsmtFinSF1_OverallQual"] = features["BsmtFinSF1"] * features["OverallQual"]
features["my_Functional_TotalHouse"] = features["my_Functional"] * features["TotalHouse"]
features["my_Functional_OverallQual"] = features["my_Functional"] + features["OverallQual"]
features["LotArea_OverallQual"] = features["LotArea"] * features["OverallQual"]
features["TotalHouse_LotArea"] = features["TotalHouse"] + features["LotArea"]
features["my_Condition1_TotalHouse"] = features["my_Condition1"] * features["TotalHouse"]
features["my_Condition1_OverallQual"] = features["my_Condition1"] + features["OverallQual"]
features["Bsmt"] = features["BsmtFinSF1"] + features["BsmtFinSF2"] + features["BsmtUnfSF"]
features["Rooms"] = features["FullBath"]+features["TotRmsAbvGrd"]
features["PorchArea"] = features["OpenPorchSF"]+features["EnclosedPorch"]+features["3SsnPorch"]+features["ScreenPorch"]
features["TotalPlace"] = features["TotalBsmtSF"] + features["1stFlrSF"] + features["2ndFlrSF"] + features["GarageArea"] + features["OpenPorchSF"]+features["EnclosedPorch"]+features["3SsnPorch"]+features["ScreenPorch"]
features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
features['1stFlrSF'] + features['2ndFlrSF'])
features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))
features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
features['EnclosedPorch'] + features['ScreenPorch'] +
features['WoodDeckSF'])
# features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
# features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
# features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
# features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
# features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
return features
\ No newline at end of file
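# add_future ("add feature") builds aggregate and interaction features such as
# total living area, bathroom counts, porch area, and products/sums of the
# hand-crafted 'my_' encodings with quality and size columns.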
......@@ -2,6 +2,7 @@ import csv
import math
import numpy as np
import dataloader
import transformer
def load_submission(path):
csv_data = []
......@@ -19,6 +20,8 @@ def eval_test(records_predict):
return RMSE(records_real, records_predict)
def RMSE(records_real,records_predict):
# records_real = np.log1p(records_real)
# records_predict = np.log1p(records_predict)
records_real = dataloader.normlize(np.array(records_real),True)
records_predict = dataloader.normlize(np.array(records_predict),True)
if len(records_real) == len(records_predict):
......@@ -28,7 +31,9 @@ def RMSE(records_real,records_predict):
return None
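# Note: both series are min-max scaled with dataloader.normlize before the RMSE
# is computed, so scores are reported on the normalized [0, 1] price scale
# rather than in dollars.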
def main():
my_price = load_submission('./datasets/sample_submission.csv')
# my_price = load_submission('./datasets/sample_submission.csv')
my_price = load_submission('./result/0.03688_0.14435.csv')
print(eval_test(my_price))
if __name__ == '__main__':
main()
\ No newline at end of file
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
# import xgboost as xgb
# import lightgbm as lgb
import numpy as np
import torch
import dataloader
import evaluation
import time
import transformer
dimension = 85
train_desc,train_price,test_desc = dataloader.load_all(dimension)
kr = GridSearchCV(KernelRidge(kernel='polynomial'),
param_grid={"alpha": np.logspace(-3, 2, 6),
"gamma": np.logspace(-2, 2, 5)})
# print(np.logspace(-2, 2, 5))
kr.fit(train_desc, train_price)
y_kr = kr.predict(test_desc)
for i in range(len(y_kr)):
y_kr[i] = dataloader.convert2price(y_kr[i])
# print(y_kr.shape)
print(dimension,evaluation.eval_test(y_kr))
dataloader.write_csv(y_kr, './result/result.csv')
\ No newline at end of file
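# This script is the KernelRidge baseline from the commit message: a grid search
# over alpha and gamma for a polynomial-kernel ridge regression on the
# PCA-reduced features; predictions are converted back to prices, scored with
# evaluation.eval_test, and written to ./result/result.csv.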
......@@ -19,4 +19,4 @@ class Linear(nn.Module):
x = self.relu(x)
x = self.dropout(x)
x = self.fc2(x)
return x
\ No newline at end of file
return x
This diff is collapsed.
......@@ -5,7 +5,7 @@ import model
import evaluation
from torch import nn, optim
import time
import transformer
#parameter
LR = 0.0001
......@@ -14,14 +14,14 @@ BATCHSIZE = 64
CONTINUE = False
use_gpu = True
SAVE_FRE = 5
Dimension = 120
#load data
train_desc,train_price,test_desc = dataloader.load_all()
train_desc,train_price,test_desc = dataloader.load_all(Dimension)
train_desc.tolist()
train_price.tolist()
#define the network
net = model.Linear(79,256,1)
net = model.Linear(Dimension,256,1)
print(net)
if CONTINUE:
......@@ -43,9 +43,13 @@ for epoch in range(EPOCHS):
price_pres = []
price_trues = []
dataloader.match_random(train_desc, train_price)
transformer.match_random(train_desc, train_price)
train_desc = np.array(train_desc)
train_price = np.array(train_price)
# train_desc = transformer.random_transform(train_desc, 0.02)
# train_price = transformer.random_transform(train_price, 0.02)
for i in range(int(len(train_desc)/BATCHSIZE)):
desc = np.zeros((BATCHSIZE,79), dtype=np.float32)
desc = np.zeros((BATCHSIZE,Dimension), dtype=np.float32)
price = np.zeros((BATCHSIZE,1), dtype=np.float32)
for j in range(BATCHSIZE):
desc[j]=train_desc[i*BATCHSIZE+j:i*BATCHSIZE+j+1]
......@@ -69,7 +73,7 @@ for epoch in range(EPOCHS):
net.eval()
price_pres = []
for i in range(len(test_desc)):
desc = (test_desc[i]).reshape(1,79)
desc = (test_desc[i]).reshape(1,Dimension)
desc = torch.from_numpy(desc).cuda()
price_pre = net(desc)
price_pres.append(dataloader.convert2price(price_pre.cpu().detach().numpy()[0][0]))
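# test-time predictions are mapped back to the raw price scale with
# dataloader.convert2price before being evaluated and written out.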
......
import numpy as np
import random
def match_random(a,b):
state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(state)
np.random.shuffle(b)
def random_transform(a,alpha):
return a*random.uniform(1-alpha,1+alpha)
\ No newline at end of file
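# match_random shuffles two arrays with the same permutation by saving and
# restoring the numpy RNG state; random_transform scales an array by one random
# factor in [1-alpha, 1+alpha] as a light data-augmentation step.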