提交 8f938924 编写于 作者: G gongweibao 提交者: GitHub

Merge pull request #197 from gongweibao/cleanfitaline

Cleanfitaline
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import Counter
from urllib2 import urlopen
import argparse
import os
import random
import logging
import numpy as np
# Emit INFO-level messages so the download/preprocess steps report progress.
logging.basicConfig(level=logging.INFO)

# Address the raw data set is fetched from.
data_url = ('https://archive.ics.uci.edu/ml/'
            'machine-learning-databases/housing/housing.data')

# File names for the raw download and the two saved splits.
raw_data = 'housing.data'
train_data = 'housing.train.npy'
test_data = 'housing.test.npy'

# Tick labels for the feature-range bar chart, one per plotted column.
feature_names = [
    'CRIM',
    'ZN',
    'INDUS',
    'CHAS',
    'NOX',
    'RM',
    'AGE',
    'DIS',
    'RAD',
    'TAX',
    'PTRATIO',
    'B',
    'LSTAT',
]

# Absolute path of the parent of the current working directory.
root_dir = os.path.abspath(os.pardir)
def maybe_download(url, file_path):
    """Fetch ``url`` into ``file_path`` unless that file already exists.

    Args:
        url: address of the resource to download.
        file_path: destination path. When it already exists, nothing is
            fetched and the file is left untouched.
    """
    # Resolve urlopen here so this function does not depend on the
    # Python 2-only module-level import and also runs on Python 3.
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen
    from contextlib import closing

    if not os.path.exists(file_path):
        logging.info('data doesn\'t exist on %s, download from [%s]' %
                     (file_path, url))
        # Read the whole payload first so a failed request never leaves an
        # empty destination file behind; closing() releases the connection.
        with closing(urlopen(url)) as response:
            payload = response.read()
        # Binary mode: the payload is raw bytes and must be stored verbatim
        # (text mode rejects bytes on Python 3 and may translate newlines).
        with open(file_path, 'wb') as f:
            f.write(payload)
    logging.info('got raw housing data')
def save_list():
    """Write ``train.list`` and ``test.list`` in the current directory.

    Each list file receives a single line: ``data/`` followed by the name
    of the corresponding saved ``.npy`` split.
    """
    targets = (
        ('train.list', train_data),
        ('test.list', test_data),
    )
    for list_name, npy_name in targets:
        with open(list_name, 'w') as out:
            out.write('data/' + npy_name + '\n')
def feature_range(maximums, minimums):
    """Save a bar chart of ``maximums - minimums`` for every feature.

    The figure is written to ``<root_dir>/image/ranges.png``.

    Args:
        maximums: per-feature maximum values.
        minimums: per-feature minimum values, aligned with ``maximums``.
    """
    import matplotlib
    # Select the 'Agg' backend before pyplot gets imported.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    count = len(maximums)
    positions = range(count)
    spans = maximums - minimums

    fig, ax = plt.subplots()
    ax.bar(positions, spans, color='r', align='center')
    ax.set_title('feature scale')
    plt.xticks(positions, feature_names)
    # Leave one unit of margin on each side of the bars.
    plt.xlim([-1, count])

    fig.set_figheight(6)
    fig.set_figwidth(10)
    target = '%s/image/ranges.png' % root_dir
    fig.savefig(target, dpi=48)
    plt.close(fig)
def preprocess(file_path, feature_num=14, shuffle=False, ratio=0.8):
    """Normalize the raw data, split it, and save both parts as ``.npy``.

    The whitespace-separated numbers in ``file_path`` are arranged into rows
    of ``feature_num`` columns. Every column except the last is replaced by
    ``(value - column mean) / (column max - column min)``; the last column
    is stored unchanged. A range plot of the normalized columns is saved via
    ``feature_range`` and the list files are written via ``save_list``.

    Args:
        file_path: path of the raw text data file.
        feature_num: number of columns per row, including the last
            (un-normalized) column.
        shuffle: when truthy, shuffle the rows before splitting.
        ratio: fraction of rows saved to ``train_data``; the remaining rows
            are saved to ``test_data``.
    """
    data = np.fromfile(file_path, sep=' ')
    # Floor division keeps the row count an int on both Python 2 and 3
    # (true division would hand reshape a float on Python 3).
    data = data.reshape(data.shape[0] // feature_num, feature_num)

    maximums = data.max(axis=0)
    minimums = data.min(axis=0)
    avgs = data.sum(axis=0) / data.shape[0]
    feature_range(maximums[:-1], minimums[:-1])

    # Normalize all columns but the last in one vectorized step; the
    # right-hand side is fully evaluated before the assignment happens.
    inputs = slice(0, feature_num - 1)
    data[:, inputs] = (data[:, inputs] - avgs[inputs]) / (
        maximums[inputs] - minimums[inputs])

    if shuffle:
        np.random.shuffle(data)

    offset = int(data.shape[0] * ratio)
    np.save(train_data, data[:offset])
    logging.info('saved training data to %s' % train_data)
    np.save(test_data, data[offset:])
    logging.info('saved test data to %s' % test_data)
    save_list()
if __name__ == '__main__':
    # Command-line entry point: download the raw data if needed, then
    # normalize and split it.
    parser = argparse.ArgumentParser(
        description='download boston housing price data set and preprocess the data(normalization and split dataset)'
    )
    # type=float lets argparse reject a malformed ratio with a usage error
    # instead of a ValueError traceback from a later float() call.
    parser.add_argument(
        '-r',
        '--ratio',
        dest='ratio',
        type=float,
        default=0.8,
        help='ratio of data used for training')
    # A tuple (not a set) keeps the order of the choices stable in the
    # generated help and error messages.
    parser.add_argument(
        '-s',
        '--shuffle',
        dest='shuffle',
        default='0',
        choices=('0', '1'),
        help='shuffle the data before splitting, 1=shuffle, 0=do not shuffle')
    args = parser.parse_args()

    maybe_download(data_url, raw_data)
    preprocess(raw_data, shuffle=int(args.shuffle), ratio=args.ratio)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
import numpy as np
# Declare the two input slots: a 13-value dense vector and a 1-value one.
@provider(input_types=[dense_vector(13), dense_vector(1)])
def process(settings, input_file):
    """Yield one ``(features, target)`` pair of lists per saved row.

    Args:
        settings: provider settings object (not used here).
        input_file: path of a ``.npy`` file; surrounding whitespace is
            stripped before loading.

    Yields:
        A tuple of two lists: every value of the row except the last, and
        a one-element list holding the last value.
    """
    samples = np.load(input_file.strip())
    for sample in samples:
        features = sample[:-1]
        target = sample[-1:]
        yield features.tolist(), target.tolist()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import argparse
import numpy as np
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import *
from paddle.trainer.config_parser import parse_config
# Emit INFO-level messages so the MSE logged by predict() is shown.
logging.basicConfig(level=logging.INFO)
def predict(input_file, model_dir):
    """Run the saved model over a data file, log the MSE and plot the result.

    For every row of ``input_file`` all values but the last are fed to the
    network and the last value is taken as the true target. The mean squared
    error is logged and a true-vs-predicted scatter plot is written to
    ``image/predictions.png``.

    Args:
        input_file: path of the ``.npy`` file to evaluate.
        model_dir: directory the network parameters are loaded from.
    """
    # prepare PaddlePaddle environment, load models
    swig_paddle.initPaddle("--use_gpu=0")
    config = parse_config('trainer_config.py', 'is_predict=1')
    machine = swig_paddle.GradientMachine.createFromConfigProto(
        config.model_config)
    machine.loadParameters(model_dir)

    to_arguments = DataProviderConverter([dense_vector(13)])

    # Collect one [true, predicted] pair per row.
    pairs = []
    for sample in np.load(input_file):
        features = sample[:-1].tolist()
        output = machine.forwardTest(to_arguments([[features]]))
        actual = sample[-1:].tolist()[0]
        estimate = output[0]['value'][0][0]
        pairs.append([actual, estimate])
    pairs = np.matrix(pairs)

    truth = pairs[:, 0]
    guess = pairs[:, 1]
    avg_err = np.average(np.square((truth - guess)))
    logging.info('MSE of test set is %f' % avg_err)

    # draw a scatter plot
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.scatter(truth, guess)
    # Dashed diagonal marking where predicted equals true.
    y_range = [truth.min(), truth.max()]
    ax.plot(y_range, y_range, 'k--', lw=4)
    ax.set_xlabel('True ($1000)')
    ax.set_ylabel('Predicted ($1000)')
    ax.set_title('Predictions on boston housing price')
    fig.savefig('image/predictions.png', dpi=60)
    plt.close(fig)
if __name__ == '__main__':
    # Command-line entry point: read the model and data paths, then predict.
    cli = argparse.ArgumentParser(
        description='predict house price and save the result as image.')
    cli.add_argument(
        '-m', '--model',
        dest='model',
        default='output/pass-00029',
        help='model path')
    cli.add_argument(
        '-t', '--test_data',
        dest='test_data',
        default='data/housing.test.npy',
        help='test data path')
    options = cli.parse_args()

    predict(input_file=options.test_data, model_dir=options.model)
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort the script as soon as any command fails.
set -o errexit

# Train with trainer_config.py for 30 passes, saving output under ./output.
paddle train \
    --config=trainer_config.py \
    --save_dir=./output \
    --num_passes=30
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
# Config argument 'is_predict' (bool, default False) selects which outputs
# the network declares at the bottom of this file.
is_predict = get_config_arg('is_predict', bool, False)

# 1. read data: list files plus the provider function (dataprovider.process)
define_py_data_sources2(
    train_list='data/train.list',
    test_list='data/test.list',
    module='dataprovider',
    obj='process',
)

# 2. learning algorithm
settings(batch_size=2)

# 3. Network configuration: a 13-value input feeding one fully connected
#    layer of size 1 with a linear activation and named weight/bias.
x = data_layer(name='x', size=13)
y_predict = fc_layer(
    input=x,
    param_attr=ParamAttr(name='w'),
    size=1,
    act=LinearActivation(),
    bias_attr=ParamAttr(name='b'),
)

if is_predict:
    # Prediction: expose the layer output directly.
    outputs(y_predict)
else:
    # Training: add the label input and output the regression cost.
    y = data_layer(name='y', size=1)
    cost = regression_cost(input=y_predict, label=y)
    outputs(cost)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册