Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
小小明-代码实体
python_gui
提交
b1e51542
P
python_gui
项目概览
小小明-代码实体
/
python_gui
通知
199
Star
11
Fork
2
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
python_gui
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
b1e51542
编写于
11月 26, 2021
作者:
小小明-代码实体
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
PDF文本提取源码
上级
e885c3d2
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
199 additions
and
0 deletions
+199
-0
pdf_reader.py
pdf_reader.py
+199
-0
未找到文件。
pdf_reader.py
0 → 100644
浏览文件 @
b1e51542
"""
小小明的代码
CSDN主页:https://blog.csdn.net/as604049322
"""
# Module metadata: the author's name and the date the script was written.
__author__ = "小小明"
__time__ = "2021/11/24"
import csv
import os
import tempfile

import fitz
import wx
class MyCanvas(wx.Panel):
    """Panel that displays one PDF page and lets the user mark text regions.

    The current page is rendered to a bitmap and painted on the panel.
    Dragging with the left mouse button adds a rectangle ``[x, y, w, h]``
    (panel pixel coordinates) to ``self.rects``.  The text found inside the
    most recent rectangle is shown in the parent frame's title, and the
    toolbar buttons export the text of every region on every page to CSV or
    save every page as a PNG image.

    Attributes:
        parent: The owning frame; its title and size are updated by the panel.
        rects: List of selected regions, each a list ``[x, y, w, h]``.
        pdfDoc: The open ``fitz`` document, or ``None`` before a file is opened.
        bmp: ``wx.Bitmap`` of the current page, or ``None``.
        path: Base name of the opened PDF (used as the title prefix).
        i: Zero-based index of the page being displayed.
        pageCount: Number of pages in ``pdfDoc``.
        g1: Gauge that shows progress while page images are being saved.
    """

    # Characters that are illegal (or awkward) in Windows file names; they
    # are replaced with "_" when extracted text is used as an image name.
    _FILENAME_TRANS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})
    # Selections narrower or shorter than this many pixels are treated as
    # accidental clicks and discarded.
    _MIN_SELECTION = 5

    def __init__(self, parent):
        """Create the panel, its toolbar buttons and the progress gauge.

        Args:
            parent: The ``wx.Frame`` (or other window) hosting this panel.
        """
        super().__init__(parent)
        self.parent = parent
        self.rects = []
        # Document state is initialised up front so handlers can test for
        # ``None`` instead of probing with hasattr().
        self.pdfDoc = None
        self.bmp = None
        self.path = ""
        self.i = 0
        self.pageCount = 0
        # Anchor point of the selection currently being dragged.
        self.x = 0
        self.y = 0
        self._dragging = False

        for mouse_event in (wx.EVT_LEFT_DOWN, wx.EVT_LEFT_UP, wx.EVT_MOTION):
            self.Bind(mouse_event, self.OnLeftButtonEvent)
        self.Bind(wx.EVT_PAINT, self.DoDrawing)

        # (label, position, size, click handler) for each toolbar button.
        toolbar = (
            ("打开文件", (0, 0), wx.DefaultSize, self.OnButton),
            ("保存文件", (75, 0), wx.DefaultSize, self.save_file),
            ("保存图片", (150, 0), wx.DefaultSize, self.save_img),
            ("撤销选区", (225, 0), wx.DefaultSize, self.back_select),
            ("《", (300, 0), (25, 25), self.previous),
            ("》", (325, 0), (25, 25), self.next),
        )
        for label, pos, size, handler in toolbar:
            button = wx.Button(self, -1, label, pos, size=size)
            self.Bind(wx.EVT_BUTTON, handler, button)

        self.g1 = wx.Gauge(self, -1, 100, (0, 30), (-1, 100), wx.GA_VERTICAL)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _update_title(self):
        """Show the file name, plus the latest region's text, in the title."""
        if self.pdfDoc is None:
            return
        title = self.path
        if self.rects:
            title += "|" + self.extract_pdf_text()
        self.parent.SetTitle(title)

    def _show_page(self, index):
        """Make ``index`` the current page, repaint and refresh the title."""
        self.i = index
        self.change_pdf_page(index, False)
        self.DoDrawing(-1)
        self._update_title()

    @classmethod
    def _safe_filename(cls, text):
        """Return ``text`` reduced to something usable as a file name."""
        # Cap the length so long extracted paragraphs do not exceed path limits.
        return text.translate(cls._FILENAME_TRANS)[:100].strip(" .")

    @staticmethod
    def _open_in_shell(path):
        """Open ``path`` with the OS default handler (best effort).

        ``os.startfile`` takes the path as a single argument, so paths with
        spaces or shell metacharacters work.  It only exists on Windows; on
        other platforms this is a no-op.
        """
        opener = getattr(os, "startfile", None)
        if opener is None:
            return
        try:
            opener(path)
        except OSError:
            # Opening the result is a convenience only; the file is saved.
            pass

    # ------------------------------------------------------------------
    # Toolbar handlers
    # ------------------------------------------------------------------
    def previous(self, evt):
        """Show the previous page, if a document is open and one exists."""
        if self.pdfDoc is None or self.i <= 0:
            return
        self._show_page(self.i - 1)

    def next(self, evt):
        """Show the next page, if a document is open and one exists."""
        if self.pdfDoc is None or self.i >= self.pageCount - 1:
            return
        self._show_page(self.i + 1)

    def back_select(self, evt):
        """Undo the most recent selection and repaint."""
        if not self.rects:
            return
        self.rects.pop()
        self.DoDrawing(-1)
        self._update_title()

    def OnButton(self, evt):
        """Ask for a PDF file, open it and display its first page."""
        dlg = wx.FileDialog(
            self,
            message="选择一个PDF文件",
            defaultDir=os.getcwd(),
            defaultFile="",
            wildcard="PDF文件(*.pdf)|*.pdf",
            style=(wx.FD_OPEN | wx.FD_CHANGE_DIR
                   | wx.FD_FILE_MUST_EXIST | wx.FD_PREVIEW),
        )
        try:
            if dlg.ShowModal() != wx.ID_OK:
                return
            path = dlg.GetPath()
        finally:
            dlg.Destroy()

        try:
            doc = fitz.open(path)
            if len(doc) == 0:
                doc.close()
                raise ValueError("document has no pages")
        except (RuntimeError, OSError, ValueError) as exc:
            # Keep the previously opened document usable on failure.
            wx.MessageBox(f"无法打开PDF文件:\n{exc}", "错误",
                          wx.OK | wx.ICON_ERROR, self)
            return

        if self.pdfDoc is not None:
            self.pdfDoc.close()
        self.pdfDoc = doc
        self.rects = []
        self.i = 0
        # len() works on every PyMuPDF version, unlike the old camelCase
        # ``pageCount`` alias.
        self.pageCount = len(doc)
        self.path = os.path.basename(path)
        self.change_pdf_page(self.i)
        self.parent.SetTitle(self.path)
        self.DoDrawing(-1)

    def change_pdf_page(self, i, move=True):
        """Render page ``i`` of the open document and make it the bitmap.

        Args:
            i: Zero-based page index.
            move: Passed to ``change_img``; when true the frame is re-centred.
        """
        page = self.pdfDoc[i]
        rect = page.rect
        print("pdf范围:", rect)
        pix = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False, clip=rect)
        # Render into a private temporary directory rather than the current
        # working directory (which FD_CHANGE_DIR moves to the PDF's folder).
        with tempfile.TemporaryDirectory() as tmp_dir:
            img_path = os.path.join(tmp_dir, "tmp.png")
            pix.save(img_path)
            self.change_img(img_path, move)

    def save_FileDialog(self, format="csv"):
        """Ask where to save a file with the given extension.

        Args:
            format: File extension (without the dot) used in the prompt and
                the wildcard filter.

        Returns:
            The chosen path, or ``None`` when the dialog was cancelled.
        """
        dlg = wx.FileDialog(
            self,
            message=f"保存一个{format}文件",
            defaultDir=os.getcwd(),
            defaultFile="",
            wildcard=f"{format}文件(*.{format})|*.{format}",
            style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT,
        )
        try:
            if dlg.ShowModal() == wx.ID_OK:
                return dlg.GetPath()
            return None
        finally:
            dlg.Destroy()

    def save_img(self, evt):
        """Save every page as a PNG into a folder chosen by the user.

        When at least one region is selected, each image is named after the
        text found in the latest region on that page; otherwise (or when that
        text is empty) the name is ``pNNN``.  Names are sanitised and made
        unique so pages never overwrite each other.
        """
        if self.pdfDoc is None:
            return
        dlg = wx.DirDialog(self, "选择图片保存的文件夹:",
                           style=wx.DD_DEFAULT_STYLE)
        try:
            if dlg.ShowModal() != wx.ID_OK:
                return
            path = dlg.GetPath()
        finally:
            dlg.Destroy()

        mat = fitz.Matrix(1, 1)
        total = len(self.pdfDoc)
        name_rect = self.rects[-1] if self.rects else None
        used_names = set()
        for i in range(total):
            page = self.pdfDoc[i]
            fallback = f"p{i:0>3d}"
            name = fallback
            if name_rect is not None:
                text = self.extract_pdf_text(page=page, rect=name_rect)
                name = self._safe_filename(text) or fallback
            # File names are case-insensitive on Windows.
            if name.lower() in used_names:
                name = f"{name}_{fallback}"
            used_names.add(name.lower())

            pix = page.get_pixmap(matrix=mat, alpha=False, clip=page.rect)
            pix.save(os.path.join(path, f"{name}.png"))
            self.g1.SetValue((i + 1) * 100 // total)
        self._open_in_shell(path)

    def save_file(self, evt):
        """Export the text of every region on every page to a CSV file.

        The file has one header row (``区域1``, ``区域2``, ...) followed by
        one row per page.  It is written as UTF-8 with a BOM so that Excel
        detects the encoding and any extracted character can be stored.
        """
        if self.pdfDoc is None:
            return
        path = self.save_FileDialog()
        if path is None:
            return

        regions = list(self.rects)
        header = [f"区域{n}" for n in range(1, len(regions) + 1)]
        data = []
        for page_no in range(len(self.pdfDoc)):
            page = self.pdfDoc[page_no]
            data.append([self.extract_pdf_text(page, rect) for rect in regions])

        # newline="" lets the csv module control line endings itself.
        with open(path, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow(header)
            writer.writerows(data)
        self._open_in_shell(path)

    def extract_pdf_text(self, page=None, rect=None):
        """Return the stripped text inside ``rect`` on ``page``.

        Args:
            page: A ``fitz`` page; defaults to the page being displayed.
            rect: Region as ``[x, y, w, h]``; defaults to the latest selection.

        Returns:
            The extracted text, or ``""`` when no region is available.
        """
        if page is None:
            page = self.pdfDoc[self.i]
        if rect is None:
            if not self.rects:
                return ""
            rect = self.rects[-1]
        left, top, width, height = rect
        clip = fitz.Rect(left, top, left + width, top + height)
        return page.get_text(clip=clip).strip()

    def change_img(self, img_path, move=True):
        """Load ``img_path`` as the page bitmap and resize panel and frame.

        Args:
            img_path: Path of the image file to display.
            move: When true, centre the parent frame after resizing.
        """
        self.bmp = wx.Bitmap(img_path)
        self.SetSize(self.bmp.GetSize())
        self.parent.SetSize(self.parent.GetBestSize())
        if move:
            self.parent.Center()

    def DoDrawing(self, evt):
        """Paint the page bitmap and the selection rectangles.

        Args:
            evt: The ``wx.PaintEvent`` when called by wx, or any placeholder
                (callers in this class pass ``-1``) for a manual repaint.
        """
        # wx requires a PaintDC inside an EVT_PAINT handler; a ClientDC is
        # only appropriate for drawing outside of paint events.
        if isinstance(evt, wx.PaintEvent):
            dc = wx.PaintDC(self)
        else:
            dc = wx.ClientDC(self)
        if self.bmp is None:
            return
        dc.DrawBitmap(self.bmp, 0, 0, True)
        if not self.rects:
            return
        dc.SetPen(wx.Pen('blue'))
        dc.SetBrush(wx.Brush('white', wx.BRUSHSTYLE_TRANSPARENT))
        dc.DrawRectangleList(self.rects)

    def OnLeftButtonEvent(self, event):
        """Handle left-button press, drag and release to select a region.

        A press starts a new rectangle, dragging resizes it, and releasing
        either keeps it (updating the title with its text) or discards it
        when it is smaller than ``_MIN_SELECTION`` pixels in either direction.
        """
        if event.LeftDown():
            if self.pdfDoc is None:
                return  # nothing to select on yet
            self.x, self.y = event.GetPosition()
            self.rects.append([self.x, self.y, 0, 0])
            self._dragging = True
        elif event.Dragging():
            # Ignore drags that did not start with a press on this panel.
            if not self._dragging or not self.rects:
                return
            x, y = event.GetPosition()
            self.rects[-1][2] = x - self.x
            self.rects[-1][3] = y - self.y
            self.DoDrawing(-1)
        elif event.LeftUp():
            if not self._dragging or not self.rects:
                return
            self._dragging = False
            print(self.rects)
            current = self.rects[-1]
            # Normalise drags made towards the top/left so width and height
            # are positive and the rectangle still covers the same area.
            if current[2] < 0:
                current[0] += current[2]
                current[2] = -current[2]
            if current[3] < 0:
                current[1] += current[3]
                current[3] = -current[3]
            too_small = (current[2] < self._MIN_SELECTION
                         or current[3] < self._MIN_SELECTION)
            if too_small:
                self.rects.pop()
                # Erase the discarded rectangle from the screen.
                self.DoDrawing(-1)
            else:
                self._update_title()
# Application entry: build the wx application, a top-level frame hosting the
# PDF canvas, then hand control to the wx event loop.
app = wx.App()
frm = wx.Frame(parent=None)
pnl = MyCanvas(parent=frm)
frm.Center()
frm.Show()
frm.SetTitle("PDF文本提取器")
app.MainLoop()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录