dataloader.py 7.0 KB
Newer Older
H
0.15960  
hypox64 已提交
1 2 3
import os
import csv
import numpy as np
H
0.14142  
hypox64 已提交
4
import random
H
hypox64 已提交
5 6 7 8 9 10
from sklearn.decomposition import PCA 
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
import evaluation
from description_map import value_map,fix_key,fix_miss,add_future
H
0.15960  
hypox64 已提交
11 12 13 14 15 16 17 18 19 20 21 22 23

# Parse ./datasets/data_description.txt into the lookup tables used by the
# loaders below:
#   description_txt : every line of the file, stripped of surrounding whitespace
#   colon_indexs    : indices of the lines that contain ':' within their first
#                     20 characters (treated as feature header lines), followed
#                     by one hard-coded end sentinel
#   desc_keys       : feature names (header text before ':'), in file order
#   Full_map        : feature name -> {category key: integer code}; features
#                     listed in value_map additionally get a 'my_<feature>'
#                     entry holding the hand-written values from value_map
description_txt = []
colon_indexs = []
# The context manager closes the handle; the original left it open.
with open('./datasets/data_description.txt') as description_file:
    for i,line in enumerate(description_file):
        line = line.strip()
        if ':' in line[0:20]:
            colon_indexs.append(i)
        description_txt.append(line)
colon_indexs.append(524)  # hard-coded end-of-description sentinel
description_length = len(colon_indexs)-1


Full_map = {}
desc_keys = []
for i in range(len(colon_indexs)-1):
    ori_map = {}
    my_map = {}
    desc_key = description_txt[colon_indexs[i]]
    desc_key = desc_key[:desc_key.find(':')]
    desc_keys.append(desc_key)
    # Lines between this header and the next one, minus two
    # (presumably the header line and the blank line after it -- verify
    # against the description file).
    interspace = colon_indexs[i+1]-colon_indexs[i]-2

    if interspace == 0:
        # No category lines follow the header: mark the feature as numeric.
        ori_map['Just_num'] = 'None'
        Full_map[desc_key] = ori_map
    else:
        # The last line of the section is skipped (presumably a trailing
        # blank line -- verify against the description file).
        for j in range(interspace-1):
            line = description_txt[colon_indexs[i]+j+2]
            key = line[:line.find('\t')]
            # data_description.txt spells these keys inconsistently;
            # rewrite them to the spellings handled below.
            if key == 'NA ':
                key = 'NA'
            if key == 'WD ':
                key = 'WD'
            if key == 'BrkComm' or key == 'Brk Cmn':
                key = 'BrkCmn'
            if desc_key in value_map:
                my_map[key] = value_map[desc_key][key]
                Full_map['my_'+desc_key] = my_map
            # Encode categories by position: the first listed category gets
            # the largest code (interspace-1), the last listed gets 1.
            ori_map[key] = interspace-j-1
            Full_map[desc_key] = ori_map

# def normlize(npdata,justprice = False):
#     _mean = np.mean(npdata)
#     _std = np.std(npdata)
#     if justprice:       
#         _mean = 180921.195
#         _std = 79415.2918
#     return (npdata-_mean)/_std
H
0.15960  
hypox64 已提交
62

H
0.14435  
hypox64 已提交
63
def normlize(npdata,justprice = False):
    """Min-max scale ``npdata``.

    Args:
        npdata: Array-like of numbers to scale.
        justprice: When true, scale with the fixed bounds 34900.0 and
            755000.0 (the same constants ``convert2price`` uses to invert
            the scaling) instead of the minimum and maximum of ``npdata``.

    Returns:
        ``(npdata - min) / (max - min)``. When every value is equal
        (``max == min``) the result is all zeros rather than the NaNs that a
        0/0 division would produce.
    """
    if justprice:
        _min = 34900.0
        _max = 755000.0
    else:
        _min = np.min(npdata)
        _max = np.max(npdata)
    span = _max-_min
    if span == 0:
        # Constant input: npdata - _min is already all zeros, so dividing by
        # 1 keeps it at zero and avoids 0/0 = NaN.
        span = 1.0
    return (npdata-_min)/span

H
hypox64 已提交
71 72 73
# def convert2price(tensor):
#     return tensor*79415.2918+180921.195

H
0.15960  
hypox64 已提交
74 75 76 77
def convert2price(tensor):
    """Undo the fixed-bounds min-max scaling applied by ``normlize(..., True)``.

    Multiplies ``tensor`` by the span between the two price bounds and adds
    the lower bound back.
    """
    price_span = 755000.0-34900.0
    rescaled = tensor*price_span
    return rescaled+34900

def load_train():
    """Load ``./datasets/train.csv`` and encode its columns as numeric arrays.

    Row 0 and column 0 of the CSV are skipped. The next 79 columns are
    encoded per feature (``desc_keys[i]``) and the 80th is parsed as a float
    and treated as the price. Every cell is first passed through ``fix_key``.

    Encoding of a feature cell:
      * if the cell text is a key of ``Full_map[feature]``, its integer code;
      * otherwise ``fix_miss(feature)`` for ``'NA'``, else ``float(cell)``.
    Features present in ``value_map`` additionally get a ``'my_<feature>'``
    array built from ``Full_map['my_<feature>']`` (``fix_miss`` for ``'NA'``).

    Returns:
        tuple: ``(desc_map, price_map)`` where ``desc_map`` maps feature names
        (and ``'my_<feature>'`` names) to 1-D arrays with one entry per data
        row, and ``price_map`` is ``{'price': array}``.
    """
    # newline='' is the mode the csv module asks for; the context manager
    # closes the handle, which the original left open.
    with open('./datasets/train.csv', newline='') as csv_file:
        csv_data = list(csv.reader(csv_file))
    id_length = len(csv_data)-1  # number of rows after row 0

    desc_map = {}
    price_map = {}
    for i in range(80):
        arr = np.zeros(id_length)
        my_arr = np.zeros(id_length)

        if i == 79:
            # Last column: the price, parsed directly as a number.
            for j in range(id_length):
                arr[j] = float(fix_key(csv_data[j+1][i+1]))
            price_map['price'] = arr
            continue

        desc_key = desc_keys[i]
        has_custom_map = desc_key in value_map
        for j in range(id_length):
            key = fix_key(csv_data[j+1][i+1])
            # Hand-written mapping from value_map.
            if has_custom_map:
                if key == 'NA':
                    my_arr[j] = fix_miss(desc_key)
                else:
                    my_arr[j] = Full_map['my_'+desc_key][key]
            # Automatic positional mapping.
            if key in Full_map[desc_key]:
                arr[j] = Full_map[desc_key][key]
            elif key == 'NA':
                arr[j] = fix_miss(desc_key)
            else:
                arr[j] = float(key)

        # Insert the 'my_' entry first so the column order of the resulting
        # dict matches the original implementation.
        if has_custom_map:
            desc_map['my_'+desc_key] = my_arr
        desc_map[desc_key] = arr
    return desc_map,price_map
H
0.15960  
hypox64 已提交
119 120

def load_test():
    """Load ``./datasets/test.csv`` and encode its columns as numeric arrays.

    Row 0 and column 0 of the CSV are skipped; the next 79 columns are
    encoded per feature (``desc_keys[i]``). Every cell is first passed
    through ``fix_key``.

    Encoding of a feature cell:
      * if the cell text is a key of ``Full_map[feature]``, its integer code;
      * otherwise ``fix_miss(feature)`` for ``'NA'``, else ``float(cell)``.
    Features present in ``value_map`` additionally get a ``'my_<feature>'``
    array built from ``Full_map['my_<feature>']`` (``fix_miss`` for ``'NA'``).

    Returns:
        dict: feature names (and ``'my_<feature>'`` names) mapped to 1-D
        arrays with one entry per data row.
    """
    # newline='' is the mode the csv module asks for; the context manager
    # closes the handle, which the original left open.
    with open('./datasets/test.csv', newline='') as csv_file:
        csv_data = list(csv.reader(csv_file))
    id_length = len(csv_data)-1  # number of rows after row 0

    desc_map = {}
    for i in range(79):
        arr = np.zeros(id_length)
        my_arr = np.zeros(id_length)
        desc_key = desc_keys[i]
        has_custom_map = desc_key in value_map

        for j in range(id_length):
            key = fix_key(csv_data[j+1][i+1])
            # Hand-written mapping from value_map.
            if has_custom_map:
                if key == 'NA':
                    my_arr[j] = fix_miss(desc_key)
                else:
                    my_arr[j] = Full_map['my_'+desc_key][key]
            # Automatic positional mapping.
            if key in Full_map[desc_key]:
                arr[j] = Full_map[desc_key][key]
            elif key == 'NA':
                arr[j] = fix_miss(desc_key)
            else:
                arr[j] = float(key)

        # Insert the 'my_' entry first so the column order of the resulting
        # dict matches the original implementation.
        if has_custom_map:
            desc_map['my_'+desc_key] = my_arr
        desc_map[desc_key] = arr
    return desc_map
# for i,word in enumerate(wordlist,0):

def dict2numpy(dict_data):
    """Pack a dict of equal-length sequences into a 2-D float array.

    Each dict value becomes one column, in dict iteration order; the row
    count is taken from the first value.
    """
    columns = list(dict_data.values())
    row_count = len(columns[0])
    matrix = np.zeros((row_count, len(dict_data)))
    col = 0
    for column in columns:
        matrix[:, col] = np.array(column)
        col += 1
    return matrix

def load_all(dimension):
    """Build the PCA-reduced train/test feature matrices and scaled prices.

    Steps: load and encode both CSVs, pass each feature dict through
    ``add_future``, stack train and test rows, min-max scale every column
    over the stacked matrix, fit PCA on the stacked matrix, then split the
    rows back into train and test.

    Args:
        dimension: Passed to ``PCA`` as ``n_components``.

    Returns:
        tuple: ``(train_desc, train_price, test_desc)``, all ``float32``.
        ``train_price`` is scaled with ``normlize(..., True)``.
    """
    desc_map,price_map = load_train()
    desc_map = add_future(desc_map)
    train_price = np.array(price_map['price'])
    train_desc = dict2numpy(desc_map)

    desc_map = load_test()
    desc_map = add_future(desc_map)
    test_desc = dict2numpy(desc_map)

    # Remember the split point before train_desc is rebound below, so the
    # split does not depend on statement order.
    train_rows = len(train_desc)

    desc_all = np.concatenate((train_desc,test_desc),axis=0)
    for i in range(desc_all.shape[1]):
        desc_all[:,i] = normlize(desc_all[:,i])

    # Fit PCA with the requested number of principal components and reduce
    # the dimensionality of all samples (train and test together).
    pca = PCA(n_components=dimension)
    desc_all = pca.fit_transform(desc_all)

    train_price = normlize(train_price,True)
    train_desc = desc_all[:train_rows]
    test_desc = desc_all[train_rows:]

    return train_desc.astype(np.float32),train_price.astype(np.float32),test_desc.astype(np.float32)

def write_csv(prices,path,start_id=1461):
    """Write predictions to ``path`` as a two-column CSV.

    The file starts with the header row ``Id,SalePrice``; row ``i`` of
    ``prices`` is written with Id ``start_id + i``.

    Args:
        prices: Sequence of values to write, one per row.
        path: Destination file path (overwritten if it exists).
        start_id: Id assigned to the first row. Defaults to 1461, the value
            the original implementation hard-coded.
    """
    # The context manager closes the file even if a write raises.
    with open(path, "w", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Id","SalePrice"])
        for offset,price in enumerate(prices):
            writer.writerow([str(start_id+offset),price])

def main():
    """Script entry point: build the datasets with 80 PCA components.

    The kernel-ridge experiment that used to consume the loaded arrays is
    kept below as a commented-out reference; as written, this function only
    runs the loading pipeline.
    """
    dimension = 80

    train_desc,train_price,test_desc = load_all(dimension)

    # Disabled experiment (kept for reference):
    #
    # # KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
    #
    # kr = GridSearchCV(KernelRidge(kernel='polynomial', gamma=0.1),
    #                   param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
    #                               "gamma": np.logspace(-2, 2, 5)})
    #
    # kr.fit(train_desc, train_price)
    # y_kr = kr.predict(test_desc)
    # for i in range(len(y_kr)):
    #     y_kr[i] = convert2price(y_kr[i])
    # # print(y_kr.shape)
    # print(dimension,evaluation.eval_test(y_kr))
    #
    # write_csv(train_price, './result.csv')
    # # print(data)
    # plt.plot(data[1])
    # plt.show()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()