Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Stevezhangz
BERT Pytorch
提交
e281f154
B
BERT Pytorch
项目概览
Stevezhangz
/
BERT Pytorch
通知
14
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
B
BERT Pytorch
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
e281f154
编写于
4月 23, 2021
作者:
Stevezhangz
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Update data_process.py
上级
aa6be9dd
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
76 addition
and
4 deletion
+76
-4
data_process.py
data_process.py
+76
-4
未找到文件。
data_process.py
浏览文件 @
e281f154
...
...
@@ -9,7 +9,7 @@ import os
import
json
import
thulac
import
numpy
as
np
import
torch
class
general_transform_text2list
:
"""
notification: All series of data process method here only support the list type sentences, so whether json or txt file
...
...
@@ -193,13 +193,79 @@ def generate_vocab_from_poem_chuci(poem_dir,map_dir):
return
sentences
,
id_sentence
,
idx2word
,
word2idx
,
vocab_size
def creat_batch(batch_size, max_pred, maxlen, word2idx, idx2word, token_list, pre_percent):
    """Build a batch of BERT pre-training examples (masked LM + next-sentence prediction).

    Two sentences are drawn at random from ``token_list``; the pair is labelled
    ``True`` (connected) when the second immediately follows the first in the
    corpus, ``False`` otherwise. The batch is balanced: exactly half connected,
    half unconnected pairs.

    :param batch_size: number of examples to generate (half positive, half negative)
    :param max_pred: maximum number of masked positions per example
    :param maxlen: fixed sequence length; shorter sequences are zero-padded
    :param word2idx: token -> id mapping; must contain '[CLS]', '[SEP]', '[MASK]'
    :param idx2word: id -> token mapping (inverse of word2idx)
    :param token_list: list of sentences, each a list of token ids
    :param pre_percent: fraction of maskable tokens to mask (e.g. 0.15)
    :return: list of [input_ids, segment_ids, masked_tokens, masked_pos, isNext]
             — field order matches ``Text_file.__init__``.
    """
    batch = []
    connect = unconnect = 0
    # Keep sampling until both halves of the batch are filled.
    while connect < batch_size / 2 or unconnect < batch_size / 2:
        s1 = choice(token_list)
        s1_index = token_list.index(s1)
        s2 = choice(token_list)
        s2_index = token_list.index(s2)
        In_id = [word2idx['[CLS]']] + s1 + [word2idx['[SEP]']] + s2 + [word2idx['[SEP]']]
        seg_id = [0] * (1 + len(s1) + 1) + [1] * (len(s2) + 1)
        # Positions eligible for masking: everything except the special tokens.
        could_mask = [seq for seq, val in enumerate(In_id)
                      if idx2word[val] != '[CLS]' and idx2word[val] != '[SEP]']
        # Mask pre_percent of the eligible positions, at least 1, at most max_pred.
        mask_num = min(max_pred, max(int(len(could_mask) * pre_percent), 1))
        # replace=False: the original sampled WITH replacement and could mask
        # the same position twice, wasting one of the max_pred label slots.
        chosen = np.random.choice(could_mask, int(mask_num), replace=False)
        mask_pos = []
        mask_tok = []
        for pos in chosen:
            pos = int(pos)  # plain int, not numpy scalar
            mask_pos.append(pos)
            # Record the ORIGINAL token id BEFORE overwriting it — this is the
            # MLM label. (The original code never saved it and stored the
            # position list in the token slot instead.)
            mask_tok.append(In_id[pos])
            In_id[pos] = word2idx['[MASK]']
        # Zero-pad the sequence and its segment ids up to maxlen.
        pad_need = maxlen - len(In_id)
        In_id.extend([0] * pad_need)
        seg_id.extend([0] * pad_need)
        # Zero-pad the label lists up to max_pred so every example is rectangular.
        if mask_num < max_pred:
            mask_tok.extend([0] * (max_pred - int(mask_num)))
            mask_pos.extend([0] * (max_pred - int(mask_num)))
        # Field order is identical in both branches (the original swapped
        # masked_tokens/masked_pos in the unconnected branch).
        if s1_index + 1 == s2_index and connect < batch_size / 2:
            connect += 1
            batch.append([In_id, seg_id, mask_tok, mask_pos, True])
        elif s1_index + 1 != s2_index and unconnect < batch_size / 2:
            unconnect += 1
            batch.append([In_id, seg_id, mask_tok, mask_pos, False])
    return batch
def
creat_batch_demo
(
batch_size
,
max_pred
,
maxlen
,
vocab_size
,
word2idx
,
token_list
,
sentences
):
"""
this demo could be found, thanks: https://codechina.csdn.net/mirrors/wmathor/nlp-tutorial/-/tree/master/5-2.BERT
:param batch_size:
:param max_pred:
:param maxlen:
:param vocab_size:
:param word2idx:
:param token_list:
:param sentences:
:return:batch
"""
batch
=
[]
positive
=
negative
=
0
while
positive
!=
batch_size
/
2
or
negative
!=
batch_size
/
2
:
tokens_a_index
,
tokens_b_index
=
randrange
(
len
(
sentences
)),
randrange
(
len
(
sentences
))
# random choice two sentences
tokens_a
,
tokens_b
=
token_list
[
tokens_a_index
],
token_list
[
tokens_b_index
]
input_ids
=
[
word2idx
[
'[CLS]'
]]
+
tokens_a
+
[
word2idx
[
'[SEP]'
]]
+
tokens_b
+
[
word2idx
[
'[SEP]'
]]
segment_ids
=
[
0
]
*
(
1
+
len
(
tokens_a
)
+
1
)
+
[
1
]
*
(
len
(
tokens_b
)
+
1
)
n_pred
=
min
(
max_pred
,
max
(
1
,
int
(
len
(
input_ids
)
*
0.15
)))
...
...
@@ -234,7 +300,13 @@ def creat_batch(batch_size,max_pred,maxlen,vocab_size,word2idx,token_list,senten
class
Text_file
(
Data
.
Dataset
):
def
__init__
(
self
,
input_ids
,
segment_ids
,
masked_tokens
,
masked_pos
,
isNext
):
def
__init__
(
self
,
batch
):
input_ids
,
segment_ids
,
masked_tokens
,
masked_pos
,
isNext
=
zip
(
*
batch
)
input_ids
,
segment_ids
,
masked_tokens
,
masked_pos
,
isNext
=
torch
.
LongTensor
(
input_ids
),
\
torch
.
LongTensor
(
segment_ids
),
\
torch
.
LongTensor
(
masked_tokens
),
\
torch
.
LongTensor
(
masked_pos
),
\
torch
.
LongTensor
(
isNext
)
self
.
input_ids
=
input_ids
self
.
segment_ids
=
segment_ids
self
.
masked_tokens
=
masked_tokens
...
...
@@ -246,4 +318,4 @@ class Text_file(Data.Dataset):
def __getitem__(self, idx):
    """Return the idx-th example as a 5-tuple:
    (input_ids, segment_ids, masked_tokens, masked_pos, isNext).
    """
    fields = (
        self.input_ids,
        self.segment_ids,
        self.masked_tokens,
        self.masked_pos,
        self.isNext,
    )
    return tuple(field[idx] for field in fields)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录