Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Oneflow-Inc
oneflow
提交
c24059bd
O
oneflow
项目概览
Oneflow-Inc
/
oneflow
上一次同步 2 年多
通知
13
Star
2733
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
oneflow
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
c24059bd
编写于
10月 18, 2017
作者:
W
willzhang4a58
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix bug : append to record
上级
6db713e5
变更
13
隐藏空白更改
内联
并排
Showing 13 changed files with 46 additions and 49 deletions
+46
-49
examples/mnist/predict/job.prototxt
examples/mnist/predict/job.prototxt
+2
-4
examples/mnist/predict/net.prototxt
examples/mnist/predict/net.prototxt
+1
-9
examples/mnist/train/job.prototxt
examples/mnist/train/job.prototxt
+4
-8
examples/mnist/train/net.prototxt
examples/mnist/train/net.prototxt
+1
-7
examples/mnist/train/placement.prototxt
examples/mnist/train/placement.prototxt
+1
-9
oneflow/core/job/runtime_context.cpp
oneflow/core/job/runtime_context.cpp
+12
-0
oneflow/core/job/runtime_context.h
oneflow/core/job/runtime_context.h
+5
-0
oneflow/core/kernel/record_kernel.cpp
oneflow/core/kernel/record_kernel.cpp
+7
-6
oneflow/core/persistence/file_system.cpp
oneflow/core/persistence/file_system.cpp
+5
-0
oneflow/core/persistence/file_system.h
oneflow/core/persistence/file_system.h
+2
-0
oneflow/core/persistence/persistent_out_stream.cpp
oneflow/core/persistence/persistent_out_stream.cpp
+2
-0
oneflow/core/persistence/persistent_out_stream.h
oneflow/core/persistence/persistent_out_stream.h
+2
-0
oneflow/core/persistence/snapshot_manager.cpp
oneflow/core/persistence/snapshot_manager.cpp
+2
-6
未找到文件。
examples/mnist/predict/job.prototxt
浏览文件 @
c24059bd
dlnet_filepath: "./net.prototxt"
resource_filepath: "./resource.prototxt"
placement_filepath: "./placement.prototxt"
model_load_snapshot_path: ""
model_load_snapshot_path: "/willzhang/snapshots/snapshot_10"
piece_size: 1000
default_data_type: kFloat
use_async_cpu_stream: false
max_data_id_length: 0
max_data_id_length: 16
global_fs_conf {
hdfs_conf {
namenode: "hdfs://192.168.1.11:9000"
...
...
examples/mnist/predict/net.prototxt
浏览文件 @
c24059bd
...
...
@@ -27,31 +27,23 @@ op {
op {
name: "conv"
model_load_dir: "/willzhang/snapshots/snapshot_3/conv"
convolution_conf {
in: "feature/out"
out: "out"
out_num: 1
has_bias_term: true
pad_h: 0
pad_w: 0
kernel_h: 5
kernel_w: 5
stride_h: 1
stride_w: 1
dilation_h: 1
dilation_w: 1
}
}
op {
name: "ip10"
model_load_dir: "/willzhang/snapshots/snapshot_3/ip10"
innerproduct_conf {
in: "conv/out"
out: "out"
out_num: 10
has_bias_term: false
has_bias_term: true
}
}
...
...
examples/mnist/train/job.prototxt
浏览文件 @
c24059bd
dlnet_filepath: "./net.prototxt"
resource_filepath: "./resource.prototxt"
placement_filepath: "./placement.prototxt"
model_load_snapshot_path: ""
piece_size: 1000
default_data_type: kFloat
use_async_cpu_stream: false
piece_size: 100
global_fs_conf {
hdfs_conf {
namenode: "hdfs://192.168.1.11:9000"
...
...
@@ -13,16 +10,15 @@ global_fs_conf {
train_conf {
num_of_pieces_in_batch: 10
model_save_snapshots_path: "/willzhang/snapshots"
num_of_batches_in_snapshot: 6
staleness: 0
total_batch_num: 15
num_of_batches_in_snapshot: 60
total_batch_num: 600
default_fill_conf {
gaussian_conf {
mean: 0.0
std: 0.1
}
}
piece_num_of_record_loss: 10
piece_num_of_record_loss: 100
normal_mdupdt_conf {
learning_rate: 0.01
}
...
...
examples/mnist/train/net.prototxt
浏览文件 @
c24059bd
...
...
@@ -32,14 +32,8 @@ op {
out: "out"
out_num: 1
has_bias_term: true
pad_h: 0
pad_w: 0
kernel_h: 5
kernel_w: 5
stride_h: 1
stride_w: 1
dilation_h: 1
dilation_w: 1
}
}
...
...
@@ -49,7 +43,7 @@ op {
in: "conv/out"
out: "out"
out_num: 10
has_bias_term: false
has_bias_term: true
}
}
...
...
examples/mnist/train/placement.prototxt
浏览文件 @
c24059bd
...
...
@@ -14,19 +14,11 @@ placement_group {
op_set {
op_name: "conv"
op_name: "ip10"
}
parallel_conf {
policy: kDataParallel
device_name: "192.168.1.11:0-3"
}
}
placement_group {
op_set {
op_name: "softmax_loss"
}
parallel_conf {
policy: kDataParallel
device_name: "192.168.1.11:0-3"
device_name: "192.168.1.13:0-3"
}
}
oneflow/core/job/runtime_context.cpp
浏览文件 @
c24059bd
...
...
@@ -7,6 +7,18 @@ std::string RuntimeCtx::GetCtrlAddr(int64_t machine_id) const {
return
mchn
.
addr
()
+
":"
+
std
::
to_string
(
mchn
.
port
());
}
// Returns the output stream associated with `filepath`, creating it on the
// first request for that path.
//
// Streams are cached in filepath2ostream_, which owns them through
// std::unique_ptr; the returned pointer is non-owning and must not be deleted
// by the caller. Repeated calls with the same `filepath` return the same
// stream object, so successive writes go through one stream instead of a
// freshly constructed one per call.
//
// NOTE(review): no locking is performed here -- confirm that callers
// serialize access to this RuntimeCtx.
PersistentOutStream* RuntimeCtx::GetPersistentOutStream(
    const std::string& filepath) {
  // operator[] performs a single find-or-insert; a newly inserted entry holds
  // an empty unique_ptr, which is what marks "not opened yet".
  std::unique_ptr<PersistentOutStream>& slot = filepath2ostream_[filepath];
  if (!slot) { slot.reset(new PersistentOutStream(GlobalFS(), filepath)); }
  return slot.get();
}
RuntimeCtx
::
RuntimeCtx
(
const
std
::
string
&
name
)
{
this_machine_id_
=
IDMgr
::
Singleton
()
->
MachineID4MachineName
(
name
);
LOG
(
INFO
)
<<
"this machine name: "
<<
name
;
...
...
oneflow/core/job/runtime_context.h
浏览文件 @
c24059bd
...
...
@@ -4,6 +4,7 @@
#include "oneflow/core/common/blocking_counter.h"
#include "oneflow/core/job/id_manager.h"
#include "oneflow/core/persistence/persistent_in_stream.h"
#include "oneflow/core/persistence/persistent_out_stream.h"
namespace
oneflow
{
...
...
@@ -25,6 +26,8 @@ class RuntimeCtx final {
BlockingCounter
&
mut_active_actor_cnt
()
{
return
active_actor_cnt_
;
}
BlockingCounter
&
mut_inactive_actor_cnt
()
{
return
inactive_actor_cnt_
;
}
PersistentOutStream
*
GetPersistentOutStream
(
const
std
::
string
&
filepath
);
private:
RuntimeCtx
(
const
std
::
string
&
name
);
...
...
@@ -34,6 +37,8 @@ class RuntimeCtx final {
BlockingCounter
active_actor_cnt_
;
BlockingCounter
inactive_actor_cnt_
;
HashMap
<
std
::
string
,
std
::
unique_ptr
<
PersistentOutStream
>>
filepath2ostream_
;
};
}
// namespace oneflow
...
...
oneflow/core/kernel/record_kernel.cpp
浏览文件 @
c24059bd
#include "oneflow/core/kernel/record_kernel.h"
#include "oneflow/core/job/runtime_context.h"
namespace
oneflow
{
...
...
@@ -7,8 +8,6 @@ namespace {
template
<
typename
T
>
void
RecordBlobImpl
(
PersistentOutStream
&
out_stream
,
const
Blob
*
blob
)
{
CHECK_EQ
(
GetDataType
<
T
>::
val
,
blob
->
data_type
());
blob
->
shape
().
SerializeWithTextFormat
(
out_stream
);
out_stream
<<
'\n'
;
const
T
*
dptr
=
blob
->
dptr
<
T
>
();
for
(
int64_t
i
=
0
;
i
<
blob
->
shape
().
At
(
0
);
++
i
)
{
if
(
blob
->
has_data_id
())
{
...
...
@@ -41,7 +40,7 @@ void RecordKernel::Forward(
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
BnInOp2Blob
)
const
{
int64_t
parallel_id
=
reinterpret_cast
<
int64_t
>
(
kernel_ctx
.
other
);
const
std
::
string
&
root_path
=
op
()
->
op_conf
().
record_conf
().
record_path
();
OF_CALL_ONCE
(
root_path
,
GlobalFS
()
->
CreateDirIfNotExist
(
root_path
));
OF_CALL_ONCE
(
root_path
,
GlobalFS
()
->
MakeEmptyDir
(
root_path
));
for
(
const
std
::
string
&
ibn
:
op
()
->
input_bns
())
{
const
std
::
string
&
lbn
=
op
()
->
Lbn4BnInOp
(
ibn
);
const
Blob
*
blob
=
BnInOp2Blob
(
ibn
);
...
...
@@ -55,9 +54,11 @@ void RecordKernel::Forward(
std
::
string
bn_in_op_dir
=
JoinPath
(
op_dir
,
bn_in_op
);
OF_CALL_ONCE
(
bn_in_op_dir
,
GlobalFS
()
->
CreateDir
(
bn_in_op_dir
));
std
::
string
file_path
=
JoinPath
(
bn_in_op_dir
,
"part_"
+
std
::
to_string
(
parallel_id
));
PersistentOutStream
out_stream
(
GlobalFS
(),
file_path
);
RecordBlob
(
out_stream
,
blob
);
JoinPath
(
bn_in_op_dir
,
"part-"
+
std
::
to_string
(
parallel_id
));
auto
out_stream
=
RuntimeCtx
::
Singleton
()
->
GetPersistentOutStream
(
file_path
);
RecordBlob
(
*
out_stream
,
blob
);
out_stream
->
Flush
();
});
}
}
...
...
oneflow/core/persistence/file_system.cpp
浏览文件 @
c24059bd
...
...
@@ -23,6 +23,11 @@ std::string FileSystem::TranslateName(const std::string& name) const {
return
CleanPath
(
name
);
}
// Resets `dirname` to a freshly created directory.
//
// If IsDirectory() reports that `dirname` already exists as a directory, it is
// removed with RecursivelyDeleteDir() first; CreateDir() is then called
// unconditionally.
void FileSystem::MakeEmptyDir(const std::string& dirname) {
  const bool stale_dir_present = IsDirectory(dirname);
  if (stale_dir_present) { RecursivelyDeleteDir(dirname); }
  CreateDir(dirname);
}
void
FileSystem
::
RecursivelyDeleteDir
(
const
std
::
string
&
dirname
)
{
CHECK
(
FileExists
(
dirname
));
std
::
deque
<
std
::
string
>
dir_q
;
// Queue for the BFS
...
...
oneflow/core/persistence/file_system.h
浏览文件 @
c24059bd
...
...
@@ -124,6 +124,8 @@ class FileSystem {
// subdirectories.
virtual
void
RecursivelyCreateDir
(
const
std
::
string
&
dirname
);
void
MakeEmptyDir
(
const
std
::
string
&
dirname
);
// Deletes the specified directory.
virtual
void
DeleteDir
(
const
std
::
string
&
dirname
)
=
0
;
...
...
oneflow/core/persistence/persistent_out_stream.cpp
浏览文件 @
c24059bd
...
...
@@ -14,4 +14,6 @@ PersistentOutStream& PersistentOutStream::Write(const char* s, size_t n) {
return
*
this
;
}
// Forwards the flush request to the underlying writable file (file_).
void PersistentOutStream::Flush() { file_->Flush(); }
}
// namespace oneflow
oneflow/core/persistence/persistent_out_stream.h
浏览文件 @
c24059bd
...
...
@@ -18,6 +18,8 @@ class PersistentOutStream final {
// Inserts the first n characters of the array pointed by s into the stream.
PersistentOutStream
&
Write
(
const
char
*
s
,
size_t
n
);
void
Flush
();
private:
std
::
unique_ptr
<
fs
::
WritableFile
>
file_
;
};
...
...
oneflow/core/persistence/snapshot_manager.cpp
浏览文件 @
c24059bd
...
...
@@ -9,12 +9,8 @@ SnapshotMgr::SnapshotMgr(const Plan& plan) {
num_of_model_blobs_
=
0
;
if
(
JobDesc
::
Singleton
()
->
is_train
())
{
model_save_snapshots_path_
=
JobDesc
::
Singleton
()
->
md_save_snapshots_path
();
OF_CALL_ONCE
(
model_save_snapshots_path_
,
{
if
(
GlobalFS
()
->
IsDirectory
(
model_save_snapshots_path_
))
{
GlobalFS
()
->
RecursivelyDeleteDir
(
model_save_snapshots_path_
);
}
GlobalFS
()
->
CreateDir
(
model_save_snapshots_path_
);
});
OF_CALL_ONCE
(
model_save_snapshots_path_
,
GlobalFS
()
->
MakeEmptyDir
(
model_save_snapshots_path_
));
HashSet
<
std
::
string
>
model_blob_set
;
for
(
const
OperatorProto
&
op_proto
:
plan
.
op
())
{
if
(
op_proto
.
op_conf
().
has_model_save_conf
())
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录