Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenDocCN
kaggle-crawler
提交
fdc8274f
K
kaggle-crawler
项目概览
OpenDocCN
/
kaggle-crawler
通知
0
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
K
kaggle-crawler
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
fdc8274f
编写于
6月 27, 2019
作者:
W
wizardforcel
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
init
上级
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
163 addition
and
0 deletion
+163
-0
.gitlab-ci.yml
.gitlab-ci.yml
+35
-0
kaggle.js
kaggle.js
+128
-0
未找到文件。
.gitlab-ci.yml
0 → 100644
浏览文件 @
fdc8274f
# Pipeline that runs the Kaggle kernel crawler for one competition and pushes
# the generated EPUB to a branch of the target repository.
# $NAME is not defined in this file, so it has to be supplied from outside
# (the job is excluded for pushes and merge requests).
image: node:7

before_script:
  # Refresh the package index first and install non-interactively: a plain
  # `apt install` has no package lists to resolve from in a fresh container
  # and stops at the confirmation prompt when there is no terminal.
  - apt-get update -qq
  - apt-get install -y imagemagick pngquant
  - npm install sync-request
  - npm install cheerio
  - npm install gen-epub@git+https://github.com/258ch/gen-epub

job:
  script:
    - node kaggle $NAME
  except:
    - pushes
    - merge_requests

after_script:
  # Collect the generated EPUB files into a fresh repository and force-push it
  # to the branch named by GL_BRANCH.
  - mkdir out
  - mv *.epub out/
  - cd out
  - git init
  - git config user.name ${GL_UN}
  - git config user.email ${GL_EMAIL}
  - git add -A
  - git commit -am "$(date "+%Y-%m-%d %H:%M:%S")"
  # GL_TOKEN is not declared below; presumably it is a secret CI variable.
  - git push "https://${GL_TOKEN}@gitlab.com/${GL_USER}/${GL_REPO}.git" master:${GL_BRANCH} -f

variables:
  GL_UN: wizardforcel
  GL_EMAIL: 562826179@qq.com
  GL_USER: wizardforcel
  GL_REPO: kaggle-crawler
  GL_BRANCH: $NAME
\ No newline at end of file
kaggle.js
0 → 100644
浏览文件 @
fdc8274f
/*
npm install sync-request
npm install cheerio
npm install gen-epub@git+https://github.com/258ch/gen-epub
需要 Image Magick 和 pngquant
*/
var
request_
=
require
(
'
sync-request
'
)
var
fs
=
require
(
'
fs
'
)
var
{
URL
}
=
require
(
'
url
'
)
var
cheerio
=
require
(
'
cheerio
'
)
var
genEpub
=
require
(
'
gen-epub
'
)
var
crypto
=
require
(
'
crypto
'
);
var
os
=
require
(
'
os
'
)
var
path
=
require
(
'
path
'
)
var
betterImg
=
require
(
'
./img-better.js
'
)
/**
 * Call `request_` and retry when it throws.
 *
 * @param {string} method - HTTP method, passed through to `request_`.
 * @param {string} url - Target URL, passed through to `request_`.
 * @param {object} [kwargs] - Options object, passed through to `request_` unchanged.
 * @param {number} [n=5] - Maximum number of attempts. Values below 1 (or
 *   non-numeric values) are treated as 1 so that the function never falls
 *   through and returns `undefined`.
 * @returns {*} The value returned by the first attempt that does not throw.
 * @throws The error raised by the last attempt when every attempt fails.
 */
function requestWithRetry(method, url, kwargs, n = 5) {
  // Round up so a fractional count still yields the same number of attempts
  // as the plain `i < n` loop would, and clamp to at least one attempt.
  const attempts = Math.max(1, Math.ceil(n) || 1);
  let lastError;
  for (let i = 0; i < attempts; i++) {
    try {
      return request_(method, url, kwargs);
    } catch (ex) {
      lastError = ex;
    }
  }
  throw lastError;
}
/**
 * Fetch every image referenced by an HTML document and rewrite the <img>
 * tags to point at a local copy.
 *
 * For each <img>, the `src` is resolved against `pageUrl` unless it already
 * starts with "http". The local file name is the MD5 hex digest of the
 * resolved URL plus ".jpg". Images whose name is not yet in `imgs` are
 * downloaded with `request`, passed through `betterImg`, and stored in
 * `imgs`. The tag is then pointed at `../Images/<name>`.
 *
 * Any error while handling one image is logged and that image is skipped,
 * leaving its `src` untouched.
 *
 * @param {string} html - HTML document to process.
 * @param {string} pageUrl - URL the document was loaded from, used as the base for relative image paths.
 * @param {Map} imgs - Image store keyed by file name; only `has` and `set` are used, and new entries are added in place.
 * @returns {string} The serialized document with rewritten image sources.
 */
function processImg(html, pageUrl, imgs) {
  const $ = cheerio.load(html);

  $('img').each((_, el) => {
    try {
      const $img = $(el);
      let src = $img.attr('src');
      if (!src.startsWith('http')) {
        src = new URL(src, pageUrl).toString();
      }

      const digest = crypto.createHash('md5').update(src).digest('hex');
      const picname = digest + ".jpg";
      console.log(`pic: ${src} => ${picname}`);

      // Download only once per distinct URL; later references reuse the entry.
      if (!imgs.has(picname)) {
        const raw = request('GET', src).getBody();
        imgs.set(picname, betterImg(raw));
      }

      $img.attr('src', '../Images/' + picname);
    } catch (ex) {
      console.log(ex.toString());
    }
  });

  return $.html();
}
// `request` is the name the functions in this file call for HTTP access;
// it is an alias of requestWithRetry, so every call is retried on failure.
var request = requestWithRetry
/**
 * Extract a script kernel's source code from the page HTML and wrap it in a
 * <pre> element.
 *
 * The page is expected to embed a JSON string after `source":`. The string
 * is matched with escape sequences taken into account, so source code that
 * contains `\"` is captured in full, then decoded with `JSON.parse`.
 *
 * @param {string} html - HTML of the kernel page.
 * @returns {string} `<pre>…</pre>` containing the HTML-escaped source code.
 * @throws {Error} When no `source` JSON string is present in `html`.
 */
function getCode(html) {
  // A JSON string body is any run of non-quote, non-backslash characters or
  // backslash escapes; a lazy `.+?` would stop at the first escaped quote.
  const match = /source":\s*("(?:[^"\\]|\\[\s\S])*")/.exec(html);
  if (match === null) {
    throw new Error('getCode: no "source" field found in page HTML');
  }

  const source = String(JSON.parse(match[1]));

  // The code is inserted as element content, so markup characters must be
  // escaped or comparisons like `a < b` would be parsed as tags.
  const escaped = source
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  return '<pre>' + escaped + '</pre>';
}
/**
 * Find the URL of a notebook's rendered output inside the kernel page HTML.
 *
 * The URL has the form
 * `http(s)://www.kaggleusercontent.com/kf/<digits>/<anything>/__results__.html`.
 *
 * @param {string} html - HTML of the kernel page.
 * @returns {string} The first matching URL.
 * @throws {Error} When the page contains no such URL.
 */
function getContentUrl(html) {
  const match = /https?:\/\/www\.kaggleusercontent\.com\/kf\/\d+\/.+?\/__results__\.html/.exec(html);
  if (match === null) {
    throw new Error('getContentUrl: no kaggleusercontent __results__.html URL found in page HTML');
  }
  return match[0];
}
/**
 * Return the inner HTML of the <body> element of a document.
 *
 * @param {string} html - HTML document to parse with cheerio.
 * @returns {string} Serialized contents of the <body> element.
 */
function getBody(html) {
  const $ = cheerio.load(html);
  const $body = $('body');
  return $body.html();
}
/**
 * Download the kernel listing for a competition.
 *
 * Requests `kernels.json` from kaggle.com for the given competition id and
 * parses the response body as JSON.
 *
 * @param {string|number} id - Competition id, interpolated into the query string.
 * @returns {*} The parsed JSON response.
 */
function getToc(id) {
  const listingUrl = `https://www.kaggle.com/kernels.json?sortBy=hotness&group=everyone&pageSize=10000&competitionId=${id}`;
  const response = request('GET', listingUrl);
  const text = response.body.toString();
  return JSON.parse(text);
}
/**
 * Resolve a competition name to its numeric id.
 *
 * Fetches `https://www.kaggle.com/c/<name>/kernels` and returns the first
 * run of digits that follows `kaggle/` in the page.
 *
 * @param {string} name - Competition name as used in the kaggle.com URL.
 * @returns {string} The id as a string of digits.
 * @throws {Error} When the page contains no `kaggle/<digits>` occurrence.
 */
function compToId(name) {
  const url = `https://www.kaggle.com/c/${name}/kernels`;
  const html = request('GET', url).body.toString();
  const match = /kaggle\/(\d+)/.exec(html);
  if (match === null) {
    throw new Error(`compToId: could not find a competition id for "${name}" at ${url}`);
  }
  return match[1];
}
/**
 * Command-line entry point: crawl all kernels of one competition and hand
 * them to `genEpub`.
 *
 * Reads the competition name from `process.argv[2]`, resolves it to an id,
 * downloads the kernel listing, and builds one article per kernel:
 * notebooks use the body of their rendered output (with images processed by
 * `processImg`); other kernels use their source code wrapped by `getCode`.
 * Each article is prefixed with source link, author and (when present) score.
 * A title entry with empty content is placed first.
 *
 * When no competition name is given, a usage line is printed, the process
 * exit code is set to 1, and nothing is fetched.
 */
function main() {
  const name = process.argv[2];
  if (!name) {
    console.error('Usage: node kaggle <competition-name>');
    process.exitCode = 1;
    return;
  }
  console.log(`name: ${name}`);

  const id = compToId(name);
  console.log(`id: ${id}`);

  const toc = getToc(id);
  const prefix = 'https://www.kaggle.com';
  // The title entry goes first; kernel articles are appended after it.
  const articles = [{ title: `Kaggle Kernel - ${name}`, content: '' }];
  const imgs = new Map();

  for (const it of toc) {
    const url = prefix + it.scriptUrl;
    console.log(`url: ${url}`);
    const html = request('GET', url).body.toString();

    let co;
    if (it.isNotebook) {
      // Notebooks are rendered on a separate host; fetch that page instead.
      const realUrl = getContentUrl(html);
      const rendered = request('GET', realUrl).body.toString();
      co = getBody(processImg(rendered, realUrl, imgs));
    } else {
      co = getCode(html);
    }

    const from = `<p>From: <a href='${url}'>${url}</a></p>`;
    // NOTE(review): a score of 0 is skipped by this truthiness check —
    // confirm whether that is intended.
    let score = '';
    if (it.bestPublicScore) {
      score = `<p>Score: ${it.bestPublicScore}</p>`;
    }
    const au = `<p>Author: <a href='${prefix + it.author.profileUrl}'>${it.author.displayName}</a></p>`;

    articles.push({
      title: it.title,
      content: `${from}\n${au}\n${score}\n${co}`,
    });
  }

  genEpub(articles, imgs);
}
// Start the crawler only when this file is the program's entry module,
// not when it is loaded through require() from another module.
if (require.main === module) {
  main();
}
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录